In [76]:
# imports
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
from nltk import FreqDist, pos_tag
from nltk.tokenize import word_tokenize
from collections import Counter
from nltk.tokenize import RegexpTokenizer

# Import CountVectorizer and TFIDFVectorizer from feature_extraction.text.
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [3]:
pwd

'/Users/emilynaftalin/Data_Science/General Assembly/dsi/projects/project_3/code'

### Reading in Data 

In [92]:
def load_data(unclean_df, data_dir='../datasets'):
    '''Load the `label` and `merged` columns from a raw CSV file.

    Parameters
    ----------
    unclean_df : str
        File name of the raw dataset, without the ``.csv`` extension
        (despite the parameter name, this is a name, not a dataframe).
    data_dir : str, default '../datasets'
        Directory that contains the CSV file. The default matches the
        location this notebook has always read from.

    Returns
    -------
    pandas.DataFrame
        Dataframe with exactly the columns ``label`` and ``merged``, in that
        order; any other columns in the file are discarded.

    Raises
    ------
    FileNotFoundError
        If ``<data_dir>/<unclean_df>.csv`` does not exist.
    KeyError
        If the file lacks a ``label`` or ``merged`` column.
    '''
    csv_path = f'{data_dir}/{unclean_df}.csv'
    raw = pd.read_csv(csv_path)
    return raw[['label', 'merged']]

In [93]:
# Load the Vikings data from ../datasets/minnesotavikings.csv (label and merged columns only).
vikings = load_data('minnesotavikings')

In [94]:
vikings.head()

Unnamed: 0,label,merged
0,vikings,"Want to hate Rodgers,but.. Be just seems like ..."
1,vikings,"alright im bored, what are some good games to ..."
2,vikings,"CB Update: Going in to week 17, Gladney ranks ..."
3,vikings,"CB Update: Going in to week 17, Gladney ranks ..."
4,vikings,The Over 30 Club Which players will be 30+ in ...


In [7]:
vikings.shape

(200, 2)

In [85]:
# Load the John Prine data from ../datasets/prine.csv (label and merged columns only).
johnprine = load_data('prine')

In [86]:
johnprine.head()

Unnamed: 0,label,merged
0,prine,Bill Murray on John Prine
1,prine,It's my cake day. Listen to John Prine for me....
2,prine,Here's a pic of my son next to a pic of my Unc...
3,prine,"1110 S. 1st Ave Maywood, IL 60153"
4,prine,Got this original pressing of Bruised Orange i...


In [87]:
johnprine.shape

(200, 2)

### Cleaning Data

In [8]:
# This method for cleaning data comes from Emmanuel

def standardize_text(df, text_field):
    '''Strip hyperlinks, @-mentions and special characters from a text column, then lowercase it.

    The column is rewritten in place on ``df`` (callers in this notebook rely
    on that and ignore the return value); the same dataframe is also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe holding the text to clean. Modified in place.
    text_field : str
        Name of the string column to clean.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``text_field`` cleaned.
    '''
    # Applied in order: only an "@" that was not part of a mention survives
    # long enough to be spelled out as "at".
    substitutions = [
        (r"http\S+", ""),                          # full URLs
        (r"http", ""),                             # any leftover bare "http"
        (r"@\S+", ""),                             # @-mentions
        (r"[^A-Za-z0-9(),!?@\'\`\"\_\n]", " "),    # anything outside the allowed characters
        (r"@", "at"),                              # a lone "@"
    ]

    cleaned = df[text_field]
    for pattern, replacement in substitutions:
        # regex=True must be explicit: pandas >= 2.0 defaults to literal
        # matching, which would silently turn every step here into a no-op.
        cleaned = cleaned.str.replace(pattern, replacement, regex=True)

    df[text_field] = cleaned.str.lower()
    return df

In [102]:
# Clean the text of both frames. standardize_text rewrites the 'merged' column
# on each frame in place, so its return value is not needed here.
dfs = [vikings, johnprine]

for df in dfs:
    standardize_text(df, 'merged')

#### Dealing with null values 

In [103]:
vikings.isnull().sum()

label     0
merged    0
tokens    0
dtype: int64

In [106]:
johnprine.isnull().sum()

label     0
merged    0
tokens    0
dtype: int64

Below, creating a function that removes rows with null values in the `merged` column if there are 10 or fewer null values. If the number of null values is greater than 10, it does not remove the rows and instead prints the count of the null values as an alert. 

In [107]:
# dropping null values 

def drop_nulls(df, max_nulls=10):
    '''Drop rows containing null values when there are few enough of them.

    If at most ``max_nulls`` rows contain a null value, those rows are removed
    from ``df`` in place. Otherwise nothing is removed and the count is printed
    as an alert so the nulls can be inspected by hand.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe to check. Modified in place when rows are dropped; the
        remaining rows keep their original index labels.
    max_nulls : int, default 10
        Largest number of null-containing rows that will be dropped
        automatically.

    Returns
    -------
    None
        The result is reported via ``print`` and applied to ``df`` directly.
    '''
    # Count rows, not cells, so a row with several missing fields counts once
    # and the number matches what the messages below call "rows".
    null_rows = int(df.isnull().values.any(axis=1).sum())

    if null_rows <= max_nulls:
        print(f'Dropping {null_rows} rows that contained null values.')
        # Drop in place: callers pass the frame and ignore the return value,
        # so rebinding the local name would leave their frame untouched.
        df.dropna(inplace=True)
    else:
        print(f'The dataframe contains {null_rows} rows with null values so they were not dropped.')
        
# citation: credit for 'isnull().values' to this Stack Overflow discussion: 
# https://stackoverflow.com/questions/29530232/how-to-check-if-any-value-is-nan-in-a-pandas-dataframe

In [108]:
# Run the null check on the Vikings frame; the function prints what it found.
drop_nulls(vikings)

Dropping 0 rows that contained null values.


In [109]:
# Run the null check on the John Prine frame; the function prints what it found.
drop_nulls(johnprine)

Dropping 0 rows that contained null values.


#### Adding Column for Tokenized Text 

Use the convention below to add a column called `tokens` that holds a tokenized version of the `merged` column.

```
tokenizer = RegexpTokenizer(r'\w+')

df['tokens'] = df['merged'].apply(tokenizer.tokenize)

```

In [110]:
# r'\w+' keeps runs of word characters only, so punctuation is dropped and
# contractions are split apart (e.g. "it's" -> "it", "s").
tokenizer = RegexpTokenizer(r'\w+')

dfs = [vikings, johnprine]

# Add a 'tokens' column holding the token list for each row's 'merged' text.
for df in dfs:
    df['tokens'] = df['merged'].apply(tokenizer.tokenize)

In [111]:
vikings.head()

Unnamed: 0,label,merged,tokens
0,vikings,"want to hate rodgers,but be just seems like ...","[want, to, hate, rodgers, but, be, just, seems..."
1,vikings,"alright im bored, what are some good games to ...","[alright, im, bored, what, are, some, good, ga..."
2,vikings,"cb update going in to week 17, gladney ranks ...","[cb, update, going, in, to, week, 17, gladney,..."
3,vikings,"cb update going in to week 17, gladney ranks ...","[cb, update, going, in, to, week, 17, gladney,..."
4,vikings,the over 30 club which players will be 30 in ...,"[the, over, 30, club, which, players, will, be..."


In [112]:
johnprine.head()

Unnamed: 0,label,merged,tokens
0,prine,bill murray on john prine,"[bill, murray, on, john, prine]"
1,prine,it's my cake day listen to john prine for me ...,"[it, s, my, cake, day, listen, to, john, prine..."
2,prine,here's a pic of my son next to a pic of my unc...,"[here, s, a, pic, of, my, son, next, to, a, pi..."
3,prine,"1110 s 1st ave maywood, il 60153","[1110, s, 1st, ave, maywood, il, 60153]"
4,prine,got this original pressing of bruised orange i...,"[got, this, original, pressing, of, bruised, o..."


Once clean-up is done, save the clean dataframe to a CSV file using code like the line below.

`df.to_csv('../datasets/clean_df.csv', index=False)`

In [113]:
pwd

'/Users/emilynaftalin/Data_Science/General Assembly/dsi/projects/project_3/code'

In [115]:
# Save each cleaned dataframe to its own CSV; index=False leaves the row index
# out of the file. Paths are relative to the notebook's working directory.
# NOTE(review): the list-valued 'tokens' column is written as text, so it will
# come back as strings when the CSV is re-read — confirm downstream handles that.

vikings.to_csv('../datasets/vikings_clean.csv', index=False)
johnprine.to_csv('../datasets/prine_clean.csv', index=False)