## Count Vectorization

In this notebook, I'll be converting my reddit data into vectors!

In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction import text 
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, GridSearchCV

In [2]:
posts = pd.read_csv('./data/reddit_clean.csv')

In [3]:
# For some reason, CountVectorizer wasn't considering band and bands the same word. Fixing that here
posts['selftext'] = posts['selftext'].apply(lambda x: x.replace('bands', 'band') if 'bands' in x else x)

In [4]:
posts['selftext']

0       looking to find some crust punk peeps I can ha...
1       Maybe I'm overthinking it, but is the line   \...
2       Im new to punk culture and have been hearing t...
3       I (14f) wasn't punk last year, I was really in...
4       Hello, I’ve got a project for a customized lea...
                              ...                        
3821    This album has been getting a lot of love and ...
3822    I’m not the biggest fan of his music although ...
3823    Hey everyone, im currently moving and trying t...
3824    This band is overlooked. Give them some love p...
3825    Huge fan of finding unknown band in this space...
Name: selftext, Length: 3826, dtype: object

In [5]:
X = posts['selftext']
y = posts['subreddit']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=182, stratify=y) # our data is pretty close to even, but I still want to stratify just to be safe.

In [7]:
stop_words = [
    'https',
    'com',
    'www',
    'amp',
    'like',
    'just',
    'spotify',
]

In [8]:
cvec = CountVectorizer(stop_words=text.ENGLISH_STOP_WORDS.union(stop_words), # Thanks jonrsharpe on StackOverflow: https://stackoverflow.com/questions/24386489/adding-words-to-scikit-learns-countvectorizers-stop-list/24386751
#                        max_df = 0.98,
#                        min_df = 0.01,
                                           
                      )

In [9]:
X_train_cvec = cvec.fit_transform(X_train)

In [10]:
X_train_df = pd.DataFrame(X_train_cvec.toarray(), columns=cvec.get_feature_names())

#### I want to see what my most frequent words are to determine if I need to add any other words to my stop words

In [11]:
# empty dictionary
top_words = {}

# Loop through columns
for i in X_train_df.columns:
    # Save sum of each column in dictionary
    top_words[i] = X_train_df[i].sum()
    
# top_words to dataframe sorted by highest occurence
most_freq = pd.DataFrame(sorted(top_words.items(),
                               key = lambda x: x[1], 
                               reverse = True))

In [12]:
most_freq.head(10)

Unnamed: 0,0,1
0,album,2511
1,punk,2073
2,open,2006
3,si,1921
4,band,1609
5,know,801
6,music,714
7,pop,681
8,ve,656
9,new,625


In [13]:
X_train_df.shape

(2869, 19416)

In [14]:
y.value_counts(normalize=True)

poppunkers    0.500784
punk          0.499216
Name: subreddit, dtype: float64

In [384]:
max_df = [0.8, 0.6]
min_df = [0.1, 0.05]
ngram_range = [(1, 1), (1, 2)]
max_features = [1000, 3000, 2000]

In [385]:
# Count Vectorizer and Naive Bayes
pipe_cvec_nb = Pipeline([
    ('cvec', CountVectorizer()),
    ('nb', MultinomialNB())
])

# Setting my parameters
params_cvec_nb = {
    'cvec__max_df' : max_df,
    'cvec__min_df' : min_df,
    'cvec__max_features' : max_features,
    'cvec__ngram_range' : ngram_range
}

In [386]:
# Count Vectorizer and Logistic Regression
pipe_cvec_lr = Pipeline([
    ('cvec', CountVectorizer()),
    ('lr', LogisticRegression(max_iter=2000))
])

params_cvec_lr = {
    'cvec__max_df' : max_df,
    'cvec__min_df' : min_df,
    'cvec__max_features' : max_features,
    'cvec__ngram_range' : ngram_range
}

In [387]:
# Tfidf Vectorizer and Naive Bayes
pipe_tvec_nb = Pipeline([
    ('tvec', TfidfVectorizer()),
    ('nb', MultinomialNB()),
])

params_tvec_nb = {
    'tvec__max_df' : max_df,
    'tvec__min_df' : min_df,
    'tvec__max_features' : max_features,
    'tvec__ngram_range' : ngram_range
}

In [388]:
# Tfidf Vectorizer and Logistic Regression
pipe_tvec_lr = Pipeline([
    ('tvec', TfidfVectorizer()),
    ('lr', LogisticRegression())
])

params_tvec_lr = {
    'tvec__max_df' : max_df,
    'tvec__min_df' : min_df,
    'tvec__max_features' : max_features,
    'tvec__ngram_range' : ngram_range
}

In [389]:
gs_cvec_nb = GridSearchCV(pipe_cvec_nb,
                         param_grid=params_cvec_nb,
                         cv=5,
                         n_jobs = 16)

In [390]:
gs_cvec_lr = GridSearchCV(pipe_cvec_lr,
                         param_grid=params_cvec_lr,
                         cv=5,
                         n_jobs = 16)

In [391]:
gs_tvec_nb = GridSearchCV(pipe_tvec_nb,
                         param_grid=params_tvec_nb,
                         cv=5,
                         n_jobs = 16)

In [392]:
gs_tvec_lr = GridSearchCV(pipe_tvec_lr,
                         param_grid=params_tvec_lr,
                         cv=5,
                         n_jobs = 16)

In [393]:
gs_cvec_nb.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('cvec', CountVectorizer()),
                                       ('nb', MultinomialNB())]),
             n_jobs=16,
             param_grid={'cvec__max_df': [0.8, 0.6],
                         'cvec__max_features': [1000, 3000, 2000],
                         'cvec__min_df': [0.1, 0.05],
                         'cvec__ngram_range': [(1, 1), (1, 2)]})

In [394]:
gs_cvec_lr.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('cvec', CountVectorizer()),
                                       ('lr',
                                        LogisticRegression(max_iter=2000))]),
             n_jobs=16,
             param_grid={'cvec__max_df': [0.8, 0.6],
                         'cvec__max_features': [1000, 3000, 2000],
                         'cvec__min_df': [0.1, 0.05],
                         'cvec__ngram_range': [(1, 1), (1, 2)]})

In [395]:
gs_tvec_nb.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('tvec', TfidfVectorizer()),
                                       ('nb', MultinomialNB())]),
             n_jobs=16,
             param_grid={'tvec__max_df': [0.8, 0.6],
                         'tvec__max_features': [1000, 3000, 2000],
                         'tvec__min_df': [0.1, 0.05],
                         'tvec__ngram_range': [(1, 1), (1, 2)]})

In [396]:
gs_tvec_lr.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('tvec', TfidfVectorizer()),
                                       ('lr', LogisticRegression())]),
             n_jobs=16,
             param_grid={'tvec__max_df': [0.8, 0.6],
                         'tvec__max_features': [1000, 3000, 2000],
                         'tvec__min_df': [0.1, 0.05],
                         'tvec__ngram_range': [(1, 1), (1, 2)]})

In [397]:
gs_tvec_nb.best_estimator_

Pipeline(steps=[('tvec',
                 TfidfVectorizer(max_df=0.6, max_features=1000, min_df=0.1,
                                 ngram_range=(1, 2))),
                ('nb', MultinomialNB())])

In [398]:
print('TRAINING SCORES')
print('===================')
print(f'Count Vectorizer with Naive Bayes: {gs_cvec_nb.score(X_train, y_train)}')
print(f'Count Vectorizer with Logistic Regression: {gs_cvec_lr.score(X_train, y_train)}')
print(f'Tfidf Vectorizer with Naive Bayes: {gs_tvec_nb.score(X_train, y_train)}')
print(f'Tfidf Vectorizer with Logistic Regression: {gs_tvec_lr.score(X_train, y_train)}')

print('TESTING SCORES')
print('==================')
print(f'Count Vectorizer with Naive Bayes: {gs_cvec_nb.score(X_test, y_test)}')
print(f'Count Vectorizer with Logistic Regression: {gs_cvec_lr.score(X_test, y_test)}')
print(f'Tfidf Vectorizer with Naive Bayes: {gs_tvec_nb.score(X_test, y_test)}')
print(f'Tfidf Vectorizer with Logistic Regression: {gs_tvec_lr.score(X_test, y_test)}')

TRAINING SCORES
Count Vectorizer with Naive Bayes: 0.649355176019519
Count Vectorizer with Logistic Regression: 0.7685604740327641
Tfidf Vectorizer with Naive Bayes: 0.7159288950853956
Tfidf Vectorizer with Logistic Regression: 0.7960962007668178
TESTING SCORES
Count Vectorizer with Naive Bayes: 0.6311389759665622
Count Vectorizer with Logistic Regression: 0.7439916405433646
Tfidf Vectorizer with Naive Bayes: 0.6677115987460815
Tfidf Vectorizer with Logistic Regression: 0.7460815047021944
