In [1]:
import re

import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

In [3]:
# Data source: a public GitHub repo of Reddit posts. Each CSV is named
# '<category>_<subreddit>.csv'; 51 files across 7 categories are listed here.
base_url = 'https://raw.githubusercontent.com/linanqiu/reddit-dataset/master/'

# Category -> subreddits, in the order the files should be loaded.
subreddits_by_category = {
    'entertainment': [
        'anime', 'comicbooks', 'harrypotter', 'movies', 'music', 'starwars',
    ],
    'gaming': [
        'dota2', 'gaming', 'leagueoflegends', 'minecraft', 'pokemon',
        'skyrim', 'starcraft', 'tf2',
    ],
    'humor': [
        'adviceanimals', 'circlejerk', 'facepalm', 'funny',
        'imgoingtohellforthis', 'jokes',
    ],
    'learning': [
        'askhistorians', 'askscience', 'explainlikeimfive', 'science',
        'space', 'todayilearned', 'youshouldknow',
    ],
    'lifestyle': [
        'drunk', 'food', 'frugal', 'guns', 'lifehacks', 'motorcycles',
        'progresspics', 'sex',
    ],
    'news': [
        'conservative', 'conspiracy', 'libertarian', 'news', 'offbeat',
        'politics', 'truereddit', 'worldnews',
    ],
    'television': [
        'breakingbad', 'community', 'doctorwho', 'gameofthrones', 'himym',
        'mylittlepony', 'startrek', 'thewalkingdead',
    ],
}

# Flatten the mapping into the CSV file names (dict insertion order is kept).
file_names = [
    f'{category}_{subreddit}.csv'
    for category, subreddits in subreddits_by_category.items()
    for subreddit in subreddits
]

# Download every CSV (base URL + file name) and stack them into one frame.
# A comprehension is used so that no loop variable -- in particular `df`,
# the name later cells use for the working frame -- leaks into the notebook
# namespace from this cell.
dataframes = [pd.read_csv(f'{base_url}{file_name}') for file_name in file_names]

# ignore_index=True gives the combined frame one continuous 0..n-1 index.
combined_df = pd.concat(dataframes, ignore_index=True)
combined_df.head()

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,0,,0,1,2,3,4.0,5,6.0,7.0,8.0,9.0,10.0
1,1,0.0,because she s the worst,d02u69l,anime,entertainment,1455683054.0,Redire77,7.0,0.0,352.0,14017.0,0.0
2,2,1.0,i am referring to this http iimgurcom5sryl...,466ijy,anime,entertainment,1455682823.0,shiba_arata,0.0,0.0,1.0,20.0,0.0
3,3,2.0,cheating but zoldycks must have a great time a...,d02g879,anime,entertainment,1455661236.0,ShaKing807,6.0,0.0,1308.0,62021.0,1.0
4,4,3.0,kurosaki ichigo http images5fanpopcomimag...,d02v88z,anime,entertainment,1455684994.0,Tf2idlingftw,2.0,0.0,4156.0,1021.0,0.0


In [56]:
# restructuring the dataframe to get ready for preprocessing
# Column '1' holds the post text and column '3' the subreddit label (see the
# preview of combined_df). Row 0 is a stray header-like row (0, 1, 2, ...),
# so it is skipped with .iloc[1:].
# NOTE(review): only the very first row is skipped; if every source CSV carries
# such a row, the others are still in the data -- TODO confirm.
#
# Missing values are dropped BEFORE casting to str: astype(str) would turn NaN
# into the literal string 'nan', which the later dropna() can no longer detect.
df = combined_df[['1', '3']].iloc[1:].dropna().astype(str)
df.columns = ['text', 'subreddit']

df.head()

Unnamed: 0,text,subreddit
1,because she s the worst,anime
2,i am referring to this http iimgurcom5sryl...,anime
3,cheating but zoldycks must have a great time a...,anime
4,kurosaki ichigo http images5fanpopcomimag...,anime
5,there are a shit ton of koutarous but the pre...,anime


In [57]:
# cleaning the text: keep only ASCII letters (digits, punctuation and other
# special characters become spaces) and normalise to lowercase
def clean_text(text):
    """Return `text` with every non-letter replaced by a space, lowercased.

    Each character outside a-z / A-Z is replaced one-for-one by a single
    space, so the result has the same length as the input.
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', text)
    return letters_only.lower()

# Run clean_text over every post, producing a frame with the cleaned column.
df = df.assign(text=df['text'].apply(clean_text))

In [58]:
# removing common/insignificant english words
# `stopwords` is already imported in the first cell, and the text was already
# lowercased by clean_text, so neither step is repeated here.
# Requires the NLTK 'stopwords' corpus to be installed
# (e.g. via nltk.download('stopwords')).
stop_words = set(stopwords.words('english'))  # set -> O(1) membership tests

def remove_stopwords(text):
    """Return `text` with English stop words removed.

    The input is coerced to str, split on whitespace, and the surviving
    tokens are re-joined with single spaces.
    """
    return " ".join(word for word in str(text).split() if word not in stop_words)

df['text'] = df['text'].apply(remove_stopwords)

In [59]:
# lemmatizing tokens (WordNet) and getting rid of short tokens
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    """Lemmatize each whitespace-separated token of `text` and re-join with spaces."""
    tokens = text.split()
    return " ".join(map(lemmatizer.lemmatize, tokens))

def remove_short_tokens(text):
    """Drop tokens of one or two characters; keep the rest, space-joined."""
    long_enough = filter(lambda token: len(token) >= 3, text.split())
    return " ".join(long_enough)

# Lemmatize first, then discard tokens shorter than three characters.
df = df.assign(
    text=df['text'].apply(lemmatize_text).apply(remove_short_tokens)
)

In [60]:
# Rows whose text was reduced to an empty string by the cleaning steps carry
# nothing to learn from: mark empty strings as missing, then drop those rows.
# NOTE(review): the recorded output shows labels such as 'television', which
# is a category value rather than a subreddit name; some raw rows may be
# column-shifted -- TODO confirm against the source CSVs.
df = df.replace('', pd.NA).dropna()
df

Unnamed: 0,text,subreddit
1,worst,anime
2,referring http iimgurcom srylmijpg deeper mean...,anime
3,cheating zoldycks must great time thanksgiving,anime
4,kurosaki ichigo http image fanpopcomimagephoto...,anime
5,shit ton koutarous presence one http smediacac...,anime
...,...,...
2663638,spm,television
2663642,evj,television
2663646,ouhb,television
2663648,stxn,television


In [61]:
# splitting data into train/test sets: 80% train / 20% test, fixed seed so the
# split is the same on every run
features, labels = df['text'], df['subreddit']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
)

In [62]:
# converting text data to numerical data using TF-IDF vectorization
MAX_FEATURES = 10000  # upper bound on the vocabulary size

vectorizer = TfidfVectorizer(max_features=MAX_FEATURES)

# The vocabulary and IDF weights are learned from the training posts only;
# the test posts are merely projected onto that fitted vocabulary.
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)

In [70]:
# further parameter tuning: 5-fold cross-validated grid search over the
# number of trees and the maximum tree depth
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20, 30]
}

# Search over a fresh estimator instead of `clf`: `clf` is only created in the
# next cell, so referencing it here raises NameError on a top-to-bottom run.
# The grid supplies n_estimators and max_depth for every candidate, so only
# the seed has to be fixed on the base estimator.
CV_clf = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,
)
CV_clf.fit(X_train_vect, y_train)
print(CV_clf.best_params_)


{'max_depth': 20, 'n_estimators': 200}


In [63]:
# training a random forest classifier on the TF-IDF training matrix
# NOTE(review): the grid-search cell's recorded output reports
# {'max_depth': 20, 'n_estimators': 200} as best, while this model uses
# 100 trees -- TODO confirm which setting is intended.
rf_params = {'n_estimators': 100, 'max_depth': 20, 'random_state': 42}
clf = RandomForestClassifier(**rf_params)
clf.fit(X_train_vect, y_train)

In [64]:
# making predictions based on classifier: one predicted label per held-out
# test post (X_test_vect is the TF-IDF matrix built from X_test)
y_pred = clf.predict(X_test_vect)

In [69]:
# getting accuracy of my classifier: share of held-out test posts whose
# predicted label matches the true label
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.89456268147961113


In [71]:
# function for applying the classifier to new text posts
def predict_subreddit(text):
    """Predict the label of a single new post.

    The post goes through the same preprocessing as the training data
    (clean -> remove stop words -> lemmatize -> drop short tokens) before it
    is vectorized; applying only part of that pipeline would give the model
    tokens in a different form from the ones it was trained on.

    Parameters
    ----------
    text : str
        Raw text of the post.

    Returns
    -------
    The first (only) element of ``clf.predict`` for the vectorized post.
    """
    # Mirror the training-time preprocessing, in the same order.
    text = clean_text(text)
    text = remove_stopwords(text)
    text = lemmatize_text(text)
    text = remove_short_tokens(text)

    text_vect = vectorizer.transform([text])  # reuse the fitted vocabulary
    prediction = clf.predict(text_vect)
    return prediction[0]

In [72]:

# Quick smoke test: classify one made-up post and show the predicted label.
sample_post = "Here's a new Reddit post text to classify"
print(predict_subreddit(sample_post))

learning
