In [378]:
# imports
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk import FreqDist, pos_tag
from nltk.tokenize import word_tokenize
from collections import Counter
from nltk.tokenize import RegexpTokenizer
import nltk 
from nltk.stem.snowball import SnowballStemmer 

# Import CountVectorizer and TFIDFVectorizer from feature_extraction.text.
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import pickle 

In [206]:
pwd

'/Users/emilynaftalin/Data_Science/General Assembly/dsi/projects/project_3/code'

### Reading in Data 

In [318]:
def load_data(unclean_df):
    '''Reads a CSV from the ../datasets/ folder and keeps only the label and merged columns.

    unclean_df: file name (string) of the CSV, without the '.csv' extension.
    Returns a dataframe containing the 'label' and 'merged' columns.
    '''
    csv_path = ''.join(['../datasets/', unclean_df, '.csv'])
    raw = pd.read_csv(csv_path)
    return raw[['label', 'merged']]

In [352]:
country = load_data('country')
country.head()

Unnamed: 0,label,merged
0,country,Country Suggestions I've been starting to get ...
1,country,Hank Williams Sr. - I'm So Lonesome I Could Cr...
2,country,Cody Jinks - Loud and Heavy
3,country,"Southern Raised Performs ""What A Day That Will..."
4,country,Is anybody still making Texas Swing? My father...


In [354]:
rock = load_data('rock')
rock.head()

Unnamed: 0,label,merged
0,rock,SRV - Hideaway &amp; Rude Mood (Montreux '82)
1,rock,Them Crooked Vultures - New Fang
2,rock,"Time Spent Driving - “Hey, You Dropped Somethi..."
3,rock,Disturbed - The Sound Of Silence (Live on Conan)
4,rock,Starset - Let It Die (2016)


_**CITATION:**_ 

For the Logistic Regression exploration in this and future notebooks, I have drawn huge insights and inspiration from this [NLP workshop by Emmanuel Ameisen (@EmmanuelAmeisen), from Insight AI](https://github.com/EmFib/concrete_NLP_tutorial/blob/master/NLP_notebook.ipynb). 

I have implemented his code in several places and cite it there with the shorthand reference _"Emmanuel"_. 

### Cleaning Data

In [325]:
# This method for cleaning data from Emmanuel 

def standardize_text(df, text_field):
    
    '''Removes special characters and hyperlinks from dataframe

    The column df[text_field] is overwritten in place with the cleaned,
    lower-cased text, and the same dataframe is returned.

    df: dataframe holding the text column to clean.
    text_field: name of the string column to clean.
    '''
    
    # (pattern, replacement) pairs applied in this exact order. Order matters:
    # "@handle" mentions are removed before any remaining lone "@" becomes "at".
    substitutions = [
        (r"http\S+", ""),
        (r"http", ""),
        (r"@\S+", ""),
        (r"[^A-Za-z0-9(),!?@\`\"\_\n]", " "),
        (r"'", ""),
        (r"\s+[a-zA-Z]\s+", " "),
        (r"@", "at"),
    ]

    cleaned = df[text_field]
    for pattern, replacement in substitutions:
        # regex=True must be explicit: pandas 2.0 changed the default of
        # Series.str.replace to regex=False, which would treat these
        # patterns as literal text and leave the column uncleaned.
        cleaned = cleaned.str.replace(pattern, replacement, regex=True)

    df[text_field] = cleaned.str.lower()
    return df

In [356]:
# Both subreddit dataframes get the same cleaning pass.
dfs = [country, rock]

# standardize_text assigns the cleaned text back onto df['merged'], so
# `country` and `rock` are modified directly and the return value is not needed.
for df in dfs:
    standardize_text(df, 'merged')

In [357]:
# checking what it looks like after cleaning 
country.head()

Unnamed: 0,label,merged
0,country,country suggestions ve been starting to get in...
1,country,hank williams sr m so lonesome could cry (1949)
2,country,cody jinks loud and heavy
3,country,"southern raised performs ""what day that will b..."
4,country,is anybody still making texas swing? my father...


Once clean-up is done, save cleaned and stemmed dataframe to csv using code like the line below.  

`df.to_csv('../datasets/clean_df.csv', index=False)`

In [358]:
# Save the cleaned dataframes to CSV; index=False leaves the row index out of the file.
country.to_csv('../datasets/country_clean.csv', index=False)
rock.to_csv('../datasets/rock_clean.csv', index=False)

#### Dealing with null values 

In [359]:
country.isnull().sum()

label     0
merged    0
dtype: int64

In [360]:
rock.isnull().sum()

label     0
merged    0
dtype: int64


Below, I create a function that removes rows with null values in the `merged` column if there are 10 or fewer null values. If the number of null values is greater than 10, it does not remove the rows and instead prints the count of the null values as an alert. 

In [361]:
# dropping null values 

def drop_nulls(df):
    
    '''Drops rows with null values if count is less than 11; otherwise prints the number of null values

    Rows are dropped in place, so the caller's dataframe is modified even
    when the return value is ignored. The same dataframe is also returned.

    df: dataframe to check for null values.
    '''
    
    # Count rows that contain at least one null, so the number in the
    # message below really is a number of rows rather than of null cells.
    null_rows = int(df.isnull().any(axis=1).sum())
    if null_rows <= 10:
        print (f'Dropping {null_rows} rows that contained null values.')
        # The previous `df = df.notna()` only rebound a local name to a
        # boolean mask and dropped nothing; dropna(inplace=True) actually
        # removes the rows from the dataframe that was passed in.
        df.dropna(inplace=True)
        
    else: 
        print (f'The dataframe contains {null_rows} rows with null values so they were not dropped.')

    return df
        
# citation: credit for 'isnull().values' to this Stack Overflow discussion: 
# https://stackoverflow.com/questions/29530232/how-to-check-if-any-value-is-nan-in-a-pandas-dataframe

### Pre-Processing Data

#### Adding Column for Tokenized Text 

Use the convention below to add a column called `tokens` that is a tokenized version of the `merged` column.

```python
tokenizer = RegexpTokenizer(r'\w+')

df['tokens'] = df['merged'].apply(tokenizer.tokenize)

```

#### Adding Column for Stemmed Words

Use the below convention to add a column called `stemmed` that will have the tokenized words that have been reduced to their root forms. I will use the [Snowball Stemmer](https://snowballstem.org/) for the English language. 

```python
snow_stemmer = SnowballStemmer(language='english') 
df['stemmed'] = df['tokens'].apply(lambda x: [snow_stemmer.stem(y) for y in x])
```

In [362]:
# this control flow will handle nulls, tokenize, and stem the dataframes 

# Splits text into runs of word characters (letters, digits, underscore).
tokenizer = RegexpTokenizer(r'\w+')

# The stemmer is created here because no other code cell defines it (it was
# only shown in a markdown snippet), so a fresh kernel would hit a NameError.
snow_stemmer = SnowballStemmer(language='english')

dfs = [country, rock]

for df in dfs:
    drop_nulls(df)
    # 'tokens' holds a list of word tokens per row; 'stemmed' holds the
    # Snowball-stemmed form of each of those tokens.
    df['tokens'] = df['merged'].apply(tokenizer.tokenize)
    df['stemmed'] = df['tokens'].apply(lambda x: [snow_stemmer.stem(y) for y in x])

Dropping 0 rows that contained null values.
Dropping 0 rows that contained null values.


In [373]:
country['tokens'] = country['merged'].apply(tokenizer.tokenize)

In [376]:
type(country['tokens'][1])

list

In [374]:
country.head()

Unnamed: 0,label,merged,tokens,stemmed,combo
0,country,country suggestions ve been starting to get in...,"[country, suggestions, ve, been, starting, to,...","[countri, suggest, ve, been, start, to, get, i...",countri suggest ve been start to get into coun...
1,country,hank williams sr m so lonesome could cry (1949),"[hank, williams, sr, m, so, lonesome, could, c...","[hank, william, sr, m, so, lonesom, could, cri...",hank william sr m so lonesom could cri 1949
2,country,cody jinks loud and heavy,"[cody, jinks, loud, and, heavy]","[codi, jink, loud, and, heavi]",codi jink loud and heavi
3,country,"southern raised performs ""what day that will b...","[southern, raised, performs, what, day, that, ...","[southern, rais, perform, what, day, that, wil...",southern rais perform what day that will be at...
4,country,is anybody still making texas swing? my father...,"[is, anybody, still, making, texas, swing, my,...","[is, anybodi, still, make, texa, swing, my, fa...",is anybodi still make texa swing my father in ...


In [364]:
# checking what it looks like after nulls/tokenize/stemming
rock.head()

Unnamed: 0,label,merged,tokens,stemmed
0,rock,srv hideaway amp rude mood (montreux 82),"[srv, hideaway, amp, rude, mood, montreux, 82]","[srv, hideaway, amp, rude, mood, montreux, 82]"
1,rock,them crooked vultures new fang,"[them, crooked, vultures, new, fang]","[them, crook, vultur, new, fang]"
2,rock,"time spent driving hey, you dropped somethi...","[time, spent, driving, hey, you, dropped, some...","[time, spent, drive, hey, you, drop, someth, 2..."
3,rock,disturbed the sound of silence (live on conan),"[disturbed, the, sound, of, silence, live, on,...","[disturb, the, sound, of, silenc, live, on, co..."
4,rock,starset let it die (2016),"[starset, let, it, die, 2016]","[starset, let, it, die, 2016]"


#### Bringing it all back together 

I use the `.join()` method to combine the stemmed tokens back into a single string so that it can be passed into a word vectorizer in the modeling notebooks. 

_Citation: Sincere thanks to Heather Johannsen and James Opacich for helping with the below function._

In [365]:
# Re-join each row's list of stemmed tokens into one space-separated string.
country['combo'] = [' '.join(stems) for stems in country['stemmed']]

In [366]:
#checking
country.head()

Unnamed: 0,label,merged,tokens,stemmed,combo
0,country,country suggestions ve been starting to get in...,"[country, suggestions, ve, been, starting, to,...","[countri, suggest, ve, been, start, to, get, i...",countri suggest ve been start to get into coun...
1,country,hank williams sr m so lonesome could cry (1949),"[hank, williams, sr, m, so, lonesome, could, c...","[hank, william, sr, m, so, lonesom, could, cri...",hank william sr m so lonesom could cri 1949
2,country,cody jinks loud and heavy,"[cody, jinks, loud, and, heavy]","[codi, jink, loud, and, heavi]",codi jink loud and heavi
3,country,"southern raised performs ""what day that will b...","[southern, raised, performs, what, day, that, ...","[southern, rais, perform, what, day, that, wil...",southern rais perform what day that will be at...
4,country,is anybody still making texas swing? my father...,"[is, anybody, still, making, texas, swing, my,...","[is, anybodi, still, make, texa, swing, my, fa...",is anybodi still make texa swing my father in ...


In [367]:
# Re-join each row's list of stemmed tokens into one space-separated string.
rock['combo'] = [' '.join(stems) for stems in rock['stemmed']]

In [368]:
#checking
rock.head()

Unnamed: 0,label,merged,tokens,stemmed,combo
0,rock,srv hideaway amp rude mood (montreux 82),"[srv, hideaway, amp, rude, mood, montreux, 82]","[srv, hideaway, amp, rude, mood, montreux, 82]",srv hideaway amp rude mood montreux 82
1,rock,them crooked vultures new fang,"[them, crooked, vultures, new, fang]","[them, crook, vultur, new, fang]",them crook vultur new fang
2,rock,"time spent driving hey, you dropped somethi...","[time, spent, driving, hey, you, dropped, some...","[time, spent, drive, hey, you, drop, someth, 2...",time spent drive hey you drop someth 2015
3,rock,disturbed the sound of silence (live on conan),"[disturbed, the, sound, of, silence, live, on,...","[disturb, the, sound, of, silenc, live, on, co...",disturb the sound of silenc live on conan
4,rock,starset let it die (2016),"[starset, let, it, die, 2016]","[starset, let, it, die, 2016]",starset let it die 2016


In [384]:
# after all that, coming back to drop stemmed and combo columns since I definitely won't use them.

# errors='ignore' keeps this cell re-runnable: without it, running the cell a
# second time raises KeyError because the columns have already been removed.
rock.drop(columns=['stemmed', 'combo'], inplace=True, errors='ignore')
country.drop(columns=['stemmed', 'combo'], inplace=True, errors='ignore')

In [385]:
country.head()

Unnamed: 0,label,merged,tokens
0,country,country suggestions ve been starting to get in...,"[country, suggestions, ve, been, starting, to,..."
1,country,hank williams sr m so lonesome could cry (1949),"[hank, williams, sr, m, so, lonesome, could, c..."
2,country,cody jinks loud and heavy,"[cody, jinks, loud, and, heavy]"
3,country,"southern raised performs ""what day that will b...","[southern, raised, performs, what, day, that, ..."
4,country,is anybody still making texas swing? my father...,"[is, anybody, still, making, texas, swing, my,..."


In [386]:
rock.head()

Unnamed: 0,label,merged,tokens
0,rock,srv hideaway amp rude mood (montreux 82),"[srv, hideaway, amp, rude, mood, montreux, 82]"
1,rock,them crooked vultures new fang,"[them, crooked, vultures, new, fang]"
2,rock,"time spent driving hey, you dropped somethi...","[time, spent, driving, hey, you, dropped, some..."
3,rock,disturbed the sound of silence (live on conan),"[disturbed, the, sound, of, silence, live, on,..."
4,rock,starset let it die (2016),"[starset, let, it, die, 2016]"


Once clean-up is done, pickle cleaned dataframe using code like the line below.  

`df.to_pickle('../datasets/clean_df.pkl')`

In [350]:
pwd

'/Users/emilynaftalin/Data_Science/General Assembly/dsi/projects/project_3/code'

In [369]:
# saving clean dataframe 

# country.to_csv('../datasets/country_stem_combo.csv', index=False)
# rock.to_csv('../datasets/rock_stem_combo.csv', index=False)

In [387]:
# Save the dataframes (label, merged and tokens columns at this point) as pickle files.
country.to_pickle('../datasets/country_token_pickle.pkl')
rock.to_pickle('../datasets/rock_token_pickle.pkl')

_**Note: After running a model, visualizing my data, and further inspecting the use of words, I decided that I do not want to stem the words before modeling and visualizing. So, I will keep my cleaned-up dataframes as they are, but I will use my original `merged` column instead of my deconstituted/reconstituted `combo` column to run my final models.**_