In [1]:
import requests
import time
import pandas as pd
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import ConfusionMatrixDisplay, f1_score, precision_score,recall_score
import nltk
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.dummy import DummyClassifier, DummyRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.tree import plot_tree, DecisionTreeClassifier
from sklearn.ensemble import VotingRegressor
from sklearn.ensemble import VotingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
import warnings
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn. compose import ColumnTransformer
import numpy as np

In [2]:
warnings.filterwarnings('ignore')

In [3]:
base_url = 'https://api.pushshift.io/reddit/search/'

In [4]:
pd.set_option('display.max_columns', None)
#pd.set_option('display.max_rows', None)

### Pulling in Posts from Reddit Pushshift API

In [5]:
def pull_posts(base_url,subreddit,length):
    data_pulls = pd.DataFrame()
    before = 1650821963
    for i in range(length):
        parameters = {'subreddit': subreddit,
                  'size': 100,
                  'is_self' : True,
                  'before': before}
        try:
            response = requests.get(base_url+'submission/',parameters)
            if response.status_code != 200:
                print('request failed')
            else:
                data = response.json()
                posts = data['data']
                posts = pd.DataFrame(posts)
                data_pulls = pd.concat([data_pulls,posts])
                time.sleep(5)
                before = data_pulls['created_utc'].iloc[-1]
        except:
            print("Exception: {}".format(type(exception).__name__))
            print("Exception message: {}".format(exception))
            time.sleep(15)
    return data_pulls

In [6]:
#futurology_pulls = pull_posts(base_url,'futurology',289)
#futurology_pulls.to_csv('futurology_pull.csv',index=False)

In [7]:
#collapse_pulls = pull_posts(base_url,'collapse',288)
#collapse_pulls.to_csv('collapse_pull.csv',index=False)

In [8]:
df_fut = pd.read_csv('futurology_pull.csv')
df_col = pd.read_csv('collapse_pull.csv')

### Checking for duplicates

In [9]:
df_fut = df_fut.drop_duplicates()
df_col = df_col.drop_duplicates()

In [11]:
df_fut_cut = df_fut[['subreddit','title']]
df_col_cut = df_col[['subreddit','title']]

### Checking to see if any rows have no English text

### Combining Dataframes

In [13]:
df_comb = pd.concat([df_col_cut,df_fut_cut])

In [14]:
df_comb.shape

(57617, 2)

### Shuffling and Splitting Dataset into Holdout

In [15]:
df_comb = df_comb.sample(frac=1,random_state=2)

In [16]:
df_holdout = df_comb[0:11523]

In [17]:
df_main = df_comb[11523:]

In [18]:
df_holdout.shape

(11523, 2)

In [19]:
df_main.shape

(46094, 2)

Using 30% of the dataset for initial analysis.

In [20]:
df_sample = df_main.sample(frac=0.3,random_state=2)

In [21]:
df_sample['subreddit'].value_counts(normalize=True)

Futurology    0.500434
collapse      0.499566
Name: subreddit, dtype: float64

### TTS

In [22]:
X = df_sample['title']
y = df_sample['subreddit']

In [23]:
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=2,stratify=y)

### Baseline Null

In [24]:
dc = DummyClassifier()
dc.fit(X_train,y_train)
dc.predict(X_test)
dc.score(X_test,y_test)

0.5004339022273647

### Tokenize/Lemmatize

In [25]:
def lemma_tokenizer(doc):
    wnl = WordNetLemmatizer()
    return [wnl.lemmatize(w) for w in word_tokenize(doc)]

### Creating Own List of Stopwords

In [26]:
my_stop_words=['english','ascii','?',',','.','(',')',':','!','u','\'','i','are','that','the','on','and','is','to','a','of',\
               'in','what','for','will','you','it','be','this','with','do','n\'t','-','\'s','\'','an',\
               '`','.',',','u','/r/futurology','/r/collapse','~','e','g','’','``','=','o','de','r/collapse','r/futurology',\
               'we','how','about','have','can','if','all','or','would','by','our','why','not','your','year','when',\
               'could','there','like','from','...','an','ha','haha','hahaha','[]','[',']','so','new','now','doe',\
               'my','more','just','any','but','who','at','some','me',';','they','no','yes','out','should','get',\
               'most','s','than','up','one','wa','make','here','thing','#','$','got','&','*','where','anyone','collapse'\
              'futurology','collapse','future']

### Count Vectorize

In [27]:
cvec = CountVectorizer(strip_accents='unicode',stop_words=my_stop_words,ngram_range=(1,2),tokenizer=lemma_tokenizer)
cvec.fit(X_train)
X_train_cv = cvec.transform(X_train)
X_test_cv = cvec.transform(X_test)

X_train_cv = pd.DataFrame(X_train_cv.A,columns=cvec.get_feature_names_out())
X_test_cv = pd.DataFrame(X_test_cv.A,columns=cvec.get_feature_names_out())



### TF-IDF Vectorize

In [79]:
tfi = TfidfVectorizer(strip_accents='unicode',stop_words=my_stop_words,ngram_range=(1,2),tokenizer=lemma_tokenizer)
tfi.fit(X_train)
X_train_tfi = tfi.transform(X_train)
X_test_tfi = tfi.transform(X_test)

X_train_tfi = pd.DataFrame(X_train_tfi.A,columns=tfi.get_feature_names_out())
X_test_tfi = pd.DataFrame(X_test_tfi.A,columns=tfi.get_feature_names_out())

### Label Encoding Y

In [29]:
le = LabelEncoder()
y_train_le = le.fit_transform(y_train)
y_test_le = le.transform(y_test)

### Reviewing Most Frequent Features

In [30]:
X_train_cv.sum().sort_values(ascending=False).head(25)

think             445
world             425
people            393
as                372
climate           365
us                324
change            314
years             290
ai                266
human             260
has               228
going             227
climate change    224
technology        215
next              196
post              177
see               176
time              176
life              170
does              160
society           153
global            151
good              150
their             148
help              146
dtype: int64

### Naive Bayes

Running Multinomial Naive Bayes on lemmatized, countvectorized data. 

In [31]:
pipenb = Pipeline(
    [
    ('cv',CountVectorizer(stop_words=my_stop_words,tokenizer=lemma_tokenizer,strip_accents='unicode')),
    ('nb',MultinomialNB())   
    ])

pipenb.fit(X_train,y_train_le)
pipenb.score(X_train,y_train_le),pipenb.score(X_test,y_test_le)

(0.8980811879278758, 0.7983800983511715)

Removed lemmatizer, added bigrams, and max-df tuned.

In [115]:
pipenb1 = Pipeline(
    [
    ('cv',CountVectorizer(stop_words=my_stop_words,strip_accents='unicode',ngram_range=(1, 2),max_df=.4)),
    ('nb',MultinomialNB())   
    ])

pipenb1.fit(X_train,y_train_le)
pipenb1.score(X_train,y_train_le),pipenb1.score(X_test,y_test_le)

(0.9687590396297368, 0.7989586346543246)

Added back in lemmatizer which improved model.

In [126]:
pipenb1 = Pipeline(
    [
    ('cv',CountVectorizer(stop_words=my_stop_words,strip_accents='unicode',ngram_range=(1, 3),max_df=.5,tokenizer=lemma_tokenizer)),
    ('nb',MultinomialNB())   
    ])

pipenb1.fit(X_train,y_train_le)
pipenb1.score(X_train,y_train_le),pipenb1.score(X_test,y_test_le)

(0.9776299296114165, 0.8030083887763957)

Scoring The TF-IDF vectorized data on Gaussian Naive Bayes. Wihtout lemmatier. Uses bigrams. Performs worse than multinomial naive bayes using count vectorization.

In [99]:
gnb = GaussianNB()

gnb.fit(X_train_tfi,y_train_le)
gnb.score(X_train_tfi,y_train_le),gnb.score(X_test_tfi,y_test_le)

(0.9872722013306335, 0.695111368238357)

### Logistic

Running fairly basic Logistic Regression. CV data. Performs worse than Naive Bayes.

In [36]:
pipe = Pipeline(
    [
    ('cv',CountVectorizer(stop_words=my_stop_words,tokenizer=lemma_tokenizer,strip_accents='unicode')),
    ('lr',LogisticRegression(max_iter=10000))   
    ])

pipe.fit(X_train,y_train_le)
pipe.score(X_train,y_train_le),pipe.score(X_test,y_test_le)

(0.9299971073184843, 0.7836274226207695)

Removing lemmatizer appears to make model slightly worse.

In [37]:
pipe2 = Pipeline(
    [
    ('cv',CountVectorizer(stop_words=my_stop_words,strip_accents='unicode')),
    ('lr',LogisticRegression(max_iter=10000))   
    ])

pipe2.fit(X_train,y_train_le)
pipe2.score(X_train,y_train_le),pipe2.score(X_test,y_test_le)

(0.9366502748047439, 0.7798669366502748)

Running gridsearch over model's parameters.

In [38]:
pipe2_params = {
    'cv__ngram_range' : [(1,1), (1,2), (2,2)],
    'cv__min_df': [1,2,3],
    'cv__max_df': [0.7,0.8,0.9,1.0],
    'lr__C': [.001,.001,.01,.1,1,10,100,1000]
}

In [39]:
gs2 = GridSearchCV(pipe2,param_grid=pipe2_params)

In [40]:
# gs2.fit(X_train,y_train)
# gs2.score(X_train,y_train), gs2.score(X_test,y_test)

Running the above gridsearch yielded the below parameters.

In [41]:
#gs2.best_params_
#{'cv__max_df': 0.7, 'cv__min_df': 1, 'cv__ngram_range': (1, 2), 'lr__C': 1}

Running a model with these parameters in pipe2alpha. This is our best logistic regression model yet.

In [43]:
pipe2alpha = Pipeline(
    [
    ('cv',CountVectorizer(stop_words=my_stop_words,strip_accents='unicode',max_df=0.7,ngram_range=(1, 2),tokenizer=lemma_tokenizer)),
    ('lr',LogisticRegression(max_iter=10000))   
    ])

pipe2alpha.fit(X_train,y_train_le)
pipe2alpha.score(X_train,y_train_le),pipe2alpha.score(X_test,y_test_le)

(0.9847652106836371, 0.7908591264101822)

Running a similar model, with TF-IDF vectorizing. Slightly outperforms count vercorized. The below model is the best of the logistic models we generate.

In [110]:
pipe3 = Pipeline(
    [
    ('tf',TfidfVectorizer(stop_words=my_stop_words,ngram_range=[1,2],max_df=0.5,tokenizer=lemma_tokenizer)),
    ('lr',LogisticRegression(max_iter=10000))   
    ])

pipe3.fit(X_train,y_train_le)
pipe3.score(X_train,y_train_le),pipe3.score(X_test,y_test_le)

(0.946292546523961, 0.7983800983511715)

Assembling gridsearch to check over max_df and ngram parameters for TF-IDF.

In [96]:
pipe3_params = {
    'tf__max_df': [.5, .6, .7],
    'tf__ngram_range': [[1,1], [1,2], [2,2]]
}

In [97]:
gs3 = GridSearchCV(pipe3,param_grid=pipe3_params)

In [98]:
gs3.fit(X_train,y_train)
gs3.score(X_train,y_train), gs3.score(X_test,y_test)

(0.8961527335840324, 0.7943303442291004)

In [91]:
gs3.best_params_
#{'tf__max_df': 0.5, 'tf__ngram_range': (1, 1)}

{'tf__max_df': 0.5, 'tf__ngram_range': (1, 1)}

In [49]:
pipe3alpha = Pipeline(
    [
    ('tf',TfidfVectorizer(stop_words=my_stop_words,ngram_range=(1,1),max_df=0.5,tokenizer=lemma_tokenizer)),
    ('lr',LogisticRegression(max_iter=10000))   
    ])

pipe3alpha.fit(X_train,y_train_le)
pipe3alpha.score(X_train,y_train_le),pipe3alpha.score(X_test,y_test_le)

(0.8961527335840324, 0.7943303442291004)

### KNN Model

KNN performs considerably worse than any models we have run thus far. The best model count vectorizes, uses 3 neighbors.

In [50]:
pipe4 = Pipeline(
    [
    ('cv',CountVectorizer(stop_words=my_stop_words,ngram_range=[1,2],max_df=0.5)),
    ('knc', KNeighborsClassifier())
    ])

pipe4.fit(X_train,y_train_le)
pipe4.score(X_train,y_train_le),pipe4.score(X_test,y_test_le)

(0.7382123228232571, 0.5822967891235175)

In [51]:
pipe5 = Pipeline(
    [
    ('tf',TfidfVectorizer(stop_words=my_stop_words,ngram_range=[1,1],max_df=0.7)),
    ('knc', KNeighborsClassifier())
    ])

pipe5.fit(X_train,y_train_le)
pipe5.score(X_train,y_train_le),pipe5.score(X_test,y_test_le)

(0.7010895767042715, 0.5533699739658664)

In [52]:
pipe4_params = {
    'knc__n_neighbors': [3, 5, 7, 9],
    'knc__weights' : ['uniform','distance']
}

In [53]:
# gs4 = GridSearchCV(pipe4,param_grid=pipe4_params)
# gs4.fit(X_train,y_train_le)
# gs4.score(X_train,y_train_le),gs4.score(X_test,y_test_le)

(0.9971073184842348, 0.620480185131617)

In [54]:
gs4.best_params_
#{'knc__n_neighbors': 3, 'knc__weights': 'distance'}

{'knc__n_neighbors': 3, 'knc__weights': 'distance'}

### Decision Tree Classifier

The decision tree performs worse than the KNN Classifier.

In [55]:
pipe6 = Pipeline(
    [
    ('cv',CountVectorizer(stop_words=my_stop_words,ngram_range=[1,2],max_df=0.5)),
    ('dtc', DecisionTreeClassifier(random_state=2,max_depth=3))
    ])

pipe6.fit(X_train,y_train_le)
pipe6.score(X_train,y_train_le),pipe6.score(X_test,y_test_le)

(0.5389065663870408, 0.5429563205091119)

In [56]:
pipe6_params = {
    'dtc__max_depth': range(1,11)
}

In [57]:
gs6 = GridSearchCV(pipe6,param_grid=pipe6_params)

In [58]:
# gs6.fit(X_train,y_train_le)
# gs6.score(X_train,y_train_le),gs6.score(X_test,y_test_le)

(0.5905891428020441, 0.5828753254266705)

In [59]:
#gs6.best_params_
#{'dtc__max_depth': 10}

{'dtc__max_depth': 10}

In [60]:
pipe7 = Pipeline(
    [
    ('cv',CountVectorizer(stop_words=my_stop_words,ngram_range=[1,2],max_df=0.7)),
    ('dtc', DecisionTreeClassifier(random_state=2,max_depth=10))
    ])

pipe7.fit(X_train,y_train_le)
pipe7.score(X_train,y_train_le),pipe7.score(X_test,y_test_le)

(0.5905891428020441, 0.5828753254266705)

In [61]:
pipe8 = Pipeline(
    [
    ('tf',TfidfVectorizer(stop_words=my_stop_words,ngram_range=[1,2],max_df=0.7)),
    ('dtc', DecisionTreeClassifier(random_state=2,max_depth=10))
    ])

pipe8.fit(X_train,y_train_le)
pipe8.score(X_train,y_train_le),pipe8.score(X_test,y_test_le)

(0.5903962973676599, 0.5817182528203645)

### Random Forest

The count-vectorized random Forest performs a bit worse than our logistic and naive bayes initially.

In [62]:
pipe9 = Pipeline(
    [
    ('cv',CountVectorizer(stop_words=my_stop_words,ngram_range=[1,2],max_df=0.7)),
    ('rfc', RandomForestClassifier(random_state=2))
    ])

pipe9.fit(X_train,y_train_le)
pipe9.score(X_train,y_train_le),pipe9.score(X_test,y_test_le)

(0.9971073184842348, 0.759039629736766)

Set up gridsearch to explore the parameters a bit more.

In [63]:
pipe9_params = {
    'rfc__max_depth': [None,5,7,10],
    'rfc__n_estimators' : [150,200,250,300],
    'rfc__min_samples_split': [2,3,4],
    'cv__ngram_range': [(1,2), (1,1)]
}

In [64]:
gs9 = GridSearchCV(pipe9,param_grid=pipe9_params)

The gridsearch random forest model generated a model that was better than the previous random forest, but not as strong as the previously seen Naive Bayes or Logistic models.

In [101]:
# gs9.fit(X_train,y_train_le)
# gs9.score(X_train,y_train_le),gs9.score(X_test,y_test_le)

In [None]:
gs9.best_params_

Running models for random forest using the TF-IDF vectorization.

In [67]:
pipe10 = Pipeline(
    [
    ('tf',TfidfVectorizer(stop_words=my_stop_words,ngram_range=[1,1],max_df=0.5,tokenizer=lemma_tokenizer)),
    ('rfc', RandomForestClassifier(random_state=2,min_samples_split=4,n_estimators=250))
    ])

pipe10.fit(X_train,y_train_le)
pipe10.score(X_train,y_train_le),pipe10.score(X_test,y_test_le)

(0.9949860187060071, 0.7671391379809083)

In [68]:
pipe10_params = {
    'rfc__max_depth': [None,10],
    'rfc__n_estimators' : [300,350,400],
    'rfc__min_samples_split': [3,4,5],
}

In [69]:
gs10 = GridSearchCV(pipe10,param_grid=pipe10_params)

A TF-IFD vectorized Random Forest with 5 minimum sample splits and 400 estimators generated the best Random Forest model we've seen.

In [70]:
# gs10.fit(X_train,y_train_le)
# gs10.score(X_train,y_train_le),gs10.score(X_test,y_test_le)

In [71]:
gs10.best_params_

AttributeError: 'GridSearchCV' object has no attribute 'best_params_'

In [103]:
pipe11 = Pipeline(
    [
    ('tf',TfidfVectorizer(stop_words=my_stop_words,ngram_range=[1,2],max_df=0.5,tokenizer=lemma_tokenizer)),
    ('rfc', RandomForestClassifier(random_state=2,min_samples_split=5,n_estimators=450))
    ])

pipe11.fit(X_train,y_train_le)
pipe11.score(X_train,y_train_le),pipe11.score(X_test,y_test_le)

(0.9946967505544306, 0.7656927972230257)

### Ensemble Analysis

I will now contruct a series of ensemble models that will combine the best models that my previous analysis produced.

In [73]:
vr1 = VotingClassifier(
    [('lr',LogisticRegression(max_iter=10000)),
     ('mnb', MultinomialNB()),
     ('rfc', RandomForestClassifier(random_state=2,min_samples_split=4,n_estimators=250)),
      ],
    n_jobs=-1
)

vr1.fit(X_train_cv,y_train_le)
vr1.score(X_train_cv,y_train_le),vr1.score(X_test_cv,y_test_le)

(0.9874650467650178, 0.7876771767428407)

In [74]:
vr2 = VotingClassifier(
    [('lr',LogisticRegression(max_iter=10000)),
     ('gnb', GaussianNB()),
     ('rfc', RandomForestClassifier(random_state=2,min_samples_split=5,n_estimators=400)),
      ],
     n_jobs=-1)

vr2.fit(X_train_tfi,y_train_le)
vr2.score(X_train_tfi,y_train_le),vr2.score(X_test_tfi,y_test_le)

(0.9939253688168933, 0.7882557130459936)

In [127]:
# vr3 = VotingClassifier(
#     [('lr',LogisticRegression(max_iter=10000)),
#      ('gnb', GaussianNB()),
#      ('rfc', RandomForestClassifier(random_state=2,min_samples_split=5,n_estimators=400)),
#       ],
#      n_jobs=-1,
#     weights = [.3,.5,.2]
# )

# vr3.fit(X_train_tfi,y_train_le)
# vr3.score(X_train_tfi,y_train_le),vr3.score(X_test_tfi,y_test_le)

### Analysis of Top Model

In [138]:
nb_cv = Pipeline(
    [
    ('cv',CountVectorizer(stop_words=my_stop_words,strip_accents='unicode',ngram_range=(1, 3),max_df=.5,tokenizer=lemma_tokenizer)),
    ('nb',MultinomialNB())   
    ])

nb_cv.fit(X_train,y_train_le)
nb_cv.score(X_train,y_train_le),nb_cv.score(X_test,y_test_le)

(0.9776299296114165, 0.8030083887763957)

In [134]:
nbcv_params = {
    'cv__ngram_range': [(1,1),(1,2),(1,3),(2,3)],
    'cv__max_df' : [0.4,0.5,0.6,0.9],
}

In [135]:
nb_cv_gs = GridSearchCV(nb_cv,param_grid=nbcv_params)

In [136]:
nb_cv_gs.fit(X_train,y_train_le)
nb_cv_gs.score(X_train,y_train_le),nb_cv_gs.score(X_test,y_test_le)

(0.9684697714781603, 0.8021405843216662)

In [137]:
nb_cv_gs.best_params_

{'cv__max_df': 0.4, 'cv__ngram_range': (1, 2)}

In [139]:
nb_cv2 = Pipeline(
    [
    ('cv',CountVectorizer(stop_words=my_stop_words,strip_accents='unicode',ngram_range=(1, 2),max_df=.4,tokenizer=lemma_tokenizer)),
    ('nb',MultinomialNB())   
    ])

pipenb1.fit(X_train,y_train_le)
pipenb1.score(X_train,y_train_le),pipenb1.score(X_test,y_test_le)

(0.9776299296114165, 0.8030083887763957)

In [143]:
nb_cv2.named_steps['nb'].coefs_

AttributeError: 'MultinomialNB' object has no attribute 'coefs_'

In [None]:
# coef_df = pd.DataFrame(lr.coef_[0],index=cvec.get_feature_names_out(),columns=['coef_val'])

# coef_df.sort_values(by='coef_val').tail(50)

# coef_df.sort_values(by='coef_val').head(50)

# params = {
#     'max_depth' : range(2,9)
# }

# gs = GridSearchCV(DecisionTreeClassifier(),params)
# gs.fit(X_train_cvdf,y_train_le)
# gs.score(X_train_cvdf,y_train_le), gs.score(X_test_cvdf,y_test_le)

# gs.best_estimator_

# gs.best_estimator_.feature_importances_

# gs_df = pd.DataFrame(gs.best_estimator_.feature_importances_,index=X_test_cvdf.columns,columns=['feature imps'])

# gs_df.sort_values(by='feature imps',ascending=False).head(50)



# balanced_accuracy_score(y_test_encoded,dc.predict(X_test))







# dtc.feature_importances_

# ConfusionMatrixDisplay.from_estimator(dtc,X_test_cvdf,y_test_le)



