In [1]:
import requests
import time
import pandas as pd
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import ConfusionMatrixDisplay
import nltk
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.dummy import DummyClassifier
from sklearn.preprocessing import LabelEncoder

In [2]:
base_url = 'https://api.pushshift.io/reddit/search/'

In [3]:
pd.set_option('display.max_columns', None)

In [4]:
def pull_posts(base_url,subreddit,length):
    data_pulls = pd.DataFrame()
    before = 1650821963
    for i in range(length):
        parameters = {'subreddit': subreddit,
                  'size': 100,
                  'is_self' : True,
                  'before': before}
        try:
            response = requests.get(base_url+'submission/',parameters)
            if response.status_code != 200:
                print('request failed')
            else:
                data = response.json()
                posts = data['data']
                posts = pd.DataFrame(posts)
                data_pulls = pd.concat([data_pulls,posts])
                time.sleep(5)
                before = data_pulls['created_utc'].iloc[-1]
        except:
            print("Exception: {}".format(type(exception).__name__))
            print("Exception message: {}".format(exception))
            time.sleep(15)
    return data_pulls

In [5]:
#futurology_pulls = pull_posts(base_url,'futurology',289)
#futurology_pulls.to_csv('futurology_pull.csv',index=False)

In [6]:
#collapse_pulls = pull_posts(base_url,'collapse',288)
#collapse_pulls.to_csv('collapse_pull.csv',index=False)

In [7]:
df_fut = pd.read_csv('futurology_pull.csv')
df_col = pd.read_csv('collapse_pull.csv')

  df_fut = pd.read_csv('futurology_pull.csv')
  df_col = pd.read_csv('collapse_pull.csv')


### Combining Dataframes

In [8]:
df_fut_cut = df_fut[['subreddit','title']]
df_col_cut = df_col[['subreddit','title']]

In [9]:
df_comb = pd.concat([df_col_cut,df_fut_cut])

In [10]:
df_comb.shape

(57617, 2)

### TTS

In [11]:
X = df_comb['title']
y = df_comb['subreddit']

In [12]:
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=42,stratify=y)

### Baseline Null

In [13]:
dc = DummyClassifier()
dc.fit(X_train,y_train)
dc.predict(X_test)
dc.score(X_test,y_test)

0.5011454356126345

### Initial LogReg Model Scores

In [14]:
def lemma_tokenizer(doc):
    wnl = WordNetLemmatizer()
    return [wnl.lemmatize(w) for w in word_tokenize(doc)]

In [16]:
pipe = Pipeline(
    [
    ('cv',CountVectorizer(stop_words='english',ngram_range=[1,2],min_df=3,tokenizer=lemma_tokenizer)), 
    ('lr',LogisticRegression(max_iter=10000))   
    ])

In [17]:
pipe.fit(X_train,y_train)
pipe.score(X_train,y_train), pipe.score(X_test,y_test)



(0.9234471905952051, 0.8469281499479347)

In [21]:
pipe2 = Pipeline(
    [
    ('tf',TfidfVectorizer(stop_words='english',ngram_range=[1,2],min_df=3,tokenizer=lemma_tokenizer)),
    ('lr',LogisticRegression(max_iter=10000))   
    ])

In [22]:
pipe2.fit(X_train,y_train)
pipe2.score(X_train,y_train), pipe.score(X_test,y_test)



(0.8988938257891327, 0.8469281499479347)

### Deeper Dive

In [62]:
my_stopwords = ['?',',','.','(',')',':','!','u','\'','i','are','that','the','on','and','is','to','a','of',\
               'in','what','for','will','you','it','be','this','with','do','n\'t','-','\'s','\'']

In [63]:
cvec = CountVectorizer(stop_words=my_stopwords,ngram_range=[1,2],min_df=3,tokenizer=lemma_tokenizer)
cvec.fit(X_train)
X_train_cv = cvec.transform(X_train)
X_test_cv = cvec.transform(X_test)

X_train_cvdf = pd.DataFrame(X_train_cv.A,columns=cvec.get_feature_names_out())
X_test_cvdf = pd.DataFrame(X_test_cv.A,columns=cvec.get_feature_names_out())



In [64]:
X_train_cvdf.sum().sort_values(ascending=False).head(25)

collapse    4991
we          4403
how         3743
future      3551
about       2299
have        2241
or          2064
can         2008
would       1941
if          1934
``          1869
think       1845
''          1829
world       1780
from        1740
year        1651
your        1597
people      1589
an          1564
why         1530
climate     1514
not         1474
’           1469
our         1467
by          1456
dtype: int64

In [48]:
# lr = LogisticRegression(max_iter=10_000)
# lr.fit(X_train_cvdf,y_train)
# lr.score(X_train_cvdf,y_train),lr.score(X_test_cvdf,y_test)

In [None]:
#ConfusionMatrixDisplay.from_estimator(lr,X_test_cvdf,y_test,cmap='Blues')

In [None]:
coef_df = pd.DataFrame(lr.coef_[0],index=cvec.get_feature_names_out(),columns=['coef_val'])

### What should we be attempting to optimize for?
