In [0]:
# import packages 
import requests
import time
import nltk
import pandas as pd
import regex as re
import numpy as np
import joblib

from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.corpus import wordnet

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import NearestNeighbors
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

In [0]:
df =pd.read_csv('/content/model.csv')

In [0]:
df = df.sample(frac=1).reset_index(drop=True)

In [0]:

df.drop('Unnamed: 0',axis=1,inplace=True)

In [0]:

df.head()

In [0]:
df.dropna()

In [0]:
df["content"] = df["title"].astype(str) + df["post_paragraph"].astype(str)

In [0]:
df.drop(['title', 'post_paragraph'], axis=1,inplace=True)

In [0]:
df.columns = ['subreddit', 'content']

In [13]:
X = df['content']
y = df['subreddit']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(8710,)
(8710,)
(3733,)
(3733,)


## Random Forest

In [0]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('classifier', RandomForestClassifier(n_estimators=200,random_state=42)),
])

In [0]:
pipeline.fit(X_train,y_train)

In [0]:
predictions = pipeline.predict(X_test)
print(classification_report(y_test,predictions))

In [0]:
# Function to get predictions 
def get_predictions(post, num_answers=5):

  """ 
  arg:
    takes a potential post
  returns:
    top options of suitable subreddits

  """

  preds = pd.Series(pipeline.predict_proba(post)[0])

  preds.index = pipeline.classes_

  preds = preds.sort_values(ascending=False)

  return preds[:5]

In [0]:
post = ["Lebron is the best in this game"]

In [0]:
get_predictions(post)

nba               0.365
gaming            0.230
Unexpected        0.160
Showerthoughts    0.035
AskReddit         0.030
dtype: float64

In [0]:
post = [ """
              This election, Some candidates have come forward to debate environmental policy issues and campaign regulations.
                """]

In [0]:
get_predictions(post)

copypasta    0.360
DnD          0.135
teenagers    0.070
politics     0.055
Piracy       0.055
dtype: float64

## KNeighbors

In [0]:
# Not the best performer 

pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('classifier', KNeighborsClassifier(n_neighbors=10)),
])

In [0]:
pipeline.fit(X_train,y_train)

In [0]:
predictions = pipeline.predict(X_test)
print(classification_report(y_test,predictions))

## Nearest Neigbors w/ Ball Tree

In [0]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('classifier', NearestNeighbors(n_neighbors=20)),
])

In [0]:
pipeline.fit(X_train,y_train)

## Random Forest is the best performing model

In [0]:
# pickle the model for transfer to Data Engineers

In [19]:
# Create pickled file with joblib
from joblib import dump, load

dump(pipeline, 'posthere_model.joblib')

['posthere_model.joblib']