In [162]:
#imports
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
import seaborn as sns
import eli5

import regex as re
import nltk
#nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier


from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier

from nltk.corpus import stopwords

import pickle


import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter(action='ignore', category=FutureWarning)

In [163]:
# df_reddit = pickle.load(open('../DataSet/df_reddit_for_model.pkl', 'rb'))
df_reddit = pickle.load(open('../DataSet/df_reddit.pkl', 'rb'))
df_reddit['target'] = df_reddit['subreddit'].replace({"NASA": 1, "Space_discussion": 0})
df_reddit.head()

Unnamed: 0,author,created_utc,domain,id,num_comments,over_18,post_hint,score,text_merged,subreddit,target
0,illichian,1579413305,i.imgur.com,eqsltj,2,False,link,1,star shine saturn ring,NASA,1
1,itstie,1579412680,i.redd.it,eqsibf,0,False,Empty,1,smithsonian nation air space museum,NASA,1
2,NASA_POTD_bot,1579410507,apod.nasa.gov,eqs6cb,0,False,Empty,1,incred expand crab nebula,NASA,1
3,AMC-Eagle85,1579410277,i.redd.it,eqs4zd,6,False,Empty,1,columbia readi st,NASA,1
4,BorisTheSpacePerson,1579404939,i.redd.it,eqr7wu,0,False,Empty,1,went ksc christma got see made interest spacef...,NASA,1


Defining X and Y variables and use train test split function

In [164]:
X = df_reddit['text_merged']
y = df_reddit['target']

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size = 0.25,
                                                    stratify=y,
                                                    random_state = 101)

In [165]:
stopwords = nltk.corpus.stopwords.words('english')
newStopWords = ['http', 'would', 'com']
stopwords.extend(newStopWords)


# tvec = TfidfVectorizer()
cvec = CountVectorizer(stop_words=stopwords, min_df=4, max_df=1.0,
                       ngram_range=(1,2),max_features = 3000)

In [166]:
term_mat = cvec.fit_transform(df_reddit['text_merged'])

In [167]:
term_df = pd.DataFrame(term_mat.toarray(), 
                       columns=cvec.get_feature_names())
term_df.insert(0, 'targets', df_reddit['target'])

In [168]:
term_df.head()

Unnamed: 0,targets,aa,ab,abil,abl,abl see,aboard,aboard intern,aboard space,abort,...,york,youlikebet,young,youtu,zealand,zero,zero graviti,zone,zoom,zubrin
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [169]:
top_words_nasa = list(term_df.groupby('targets').
    mean().T.sort_values(1, ascending=False).head(20).index)

top_words_space_dis = list(term_df.groupby('targets').
    mean().T.sort_values(0, ascending=False).head(20).index)

top_words_overlap = [word for word in top_words_space_dis if word in top_words_nasa]

In [170]:
print('top_words_nasa\n\n', top_words_nasa,
      '\n\ntop_words_space_dis\n\n', top_words_space_dis,
      '\n\ntop_words_overlap\n\n', top_words_overlap)

top_words_nasa

 ['nasa', 'space', 'moon', 'apollo', 'earth', 'astronaut', 'mission', 'mar', 'year', 'launch', 'work', 'like', 'one', 'first', 'get', 'new', 'amp', 'time', 'go', 'station'] 

top_words_space_dis

 ['space', 'earth', 'nasa', 'moon', 'time', 'amp', 'star', 'year', 'planet', 'like', 'mar', 'launch', 'univers', 'could', 'new', 'hole', 'first', 'black', 'one', 'black hole'] 

top_words_overlap

 ['space', 'earth', 'nasa', 'moon', 'time', 'amp', 'year', 'like', 'mar', 'launch', 'new', 'first', 'one']


In [171]:
X_train_features = cvec.fit_transform(X_train)
X_test_features = cvec.transform(X_test)

lr = LogisticRegression(solver='sag', 
                        max_iter=3000)

lr.fit(X_train_features, y_train);

lr.score(X_train_features, y_train); 
print('train score', lr.score(X_train_features, y_train))

lr.score(X_test_features, y_test); 
print('test score', lr.score(X_test_features, y_test))

train score 0.8376666666666667
test score 0.7513333333333333


In [215]:
gboost = GradientBoostingClassifier(loss='deviance', learning_rate=0.1, 
                                    n_estimators=100, subsample=1.0, criterion='friedman_mse', 
                                    min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, 
                                    max_depth=3, min_impurity_decrease=0.0, min_impurity_split=None, 
                                    init=None, random_state=None, max_features=None, verbose=0, 
                                    max_leaf_nodes=None, warm_start=False, validation_fraction=0.1, 
                                    n_iter_no_change=None, tol=0.0001, ccp_alpha=0.0)
gboost_params = {
    'max_depth': [2,3,4],
    'n_estimators': [100, 125, 150],
    'learning_rate': [.08, .1, .12]
}
gs = GridSearchCV(gboost, param_grid=gboost_params, cv=3)
gs.fit(X_train_features, y_train)

gs_M7 = GridSearchCV(gboost, param_grid=gboost_params, cv=3)
gs_M7.fit(X_train_features, y_train)

print(gs.best_score_)
gs.best_params_

0.7347777777777779


{'learning_rate': 0.12, 'max_depth': 4, 'n_estimators': 150}

In [218]:
gboost = GradientBoostingClassifier(loss='deviance', learning_rate=0.12, 
                                    n_estimators=150, subsample=1.0, criterion='friedman_mse', 
                                    min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, 
                                    max_depth=4, min_impurity_decrease=0.0, min_impurity_split=None, 
                                    init=None, random_state=None, max_features=None, verbose=0, 
                                    max_leaf_nodes=None, warm_start=False, validation_fraction=0.1, 
                                    n_iter_no_change=None, tol=0.0001, ccp_alpha=0.0)
gboost.fit(X_train_features, y_train)
print(gboost.score(X_test_features, y_test))

0.7343333333333333


In [None]:
mlp = MLPClassifier(hidden_layer_sizes=50, activation='relu', solver='adam', 
                    alpha=0.0001, batch_size='auto', learning_rate='constant', 
                    learning_rate_init=0.001, power_t=0.5, max_iter=200, shuffle=True, 
                    random_state=None, tol=0.0001, verbose=False, warm_start=False, 
                    momentum=0.9, nesterovs_momentum=True, early_stopping=False, 
                    validation_fraction=0.1, beta_1=0.9, beta_2=0.999, epsilon=1e-08, 
                    n_iter_no_change=10, max_fun=15000)


mlp_params = {
    'hidden_layer_sizes': [20,50,100],
    'activation': ['identity', 'tanh', 'relu'],
    'solver': ['lbfgs', 'sgd', 'adam']
}
mlpg = GridSearchCV(mlp, param_grid=mlp_params, cv=3)
mlpg.fit(X_train_features, y_train)


print(mlpg.best_score_)
mlpg.best_params_

In [214]:
mlp.fit(X_train_features,y_train)
mlp.score(X_train_features, y_train)

mlp.score(X_test_features, y_test)

0.7033333333333334

In [177]:
preds = mlp.predict(X_test_features)

# Save confusion matrix values
tn, fp, fn, tp = confusion_matrix(y_test, preds).ravel()

# View confusion matrix

#plot_confusion_matrix(gs, X_test_features, y_test, cmap='Blues', values_format='d');

Accuracy = (tp + tn) / (tp + tn + fp + fn)
Precision = tp / (tp + fp)
Recall = tp / (tp + fn)

Accuracy, Precision, Recall

(0.7, 0.6950585175552666, 0.7126666666666667)