In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
import os
import re
random.seed(42)

In [23]:
import kagglehub


# Download latest version
path = kagglehub.dataset_download("prakharrathi25/reddit-data-huge")

In [24]:
# List all CSV files in the folder
csv_files = [f for f in os.listdir(path) if f.endswith('.csv')]

# Initialize a list to hold DataFrames
dataframes = []

# Read each CSV file into a DataFrame and add it to the list
for csv_file in csv_files:
    file_path = os.path.join(path, csv_file)
    df = pd.read_csv(file_path)
    dataframes.append(df)

# Optionally, combine all DataFrames into a single DataFrame
combined_df = pd.concat(dataframes, ignore_index=True)

In [25]:
df= combined_df.copy()

#Dropping unnecessary columns
drop_columns = ['Unnamed: 0', 'ID', 'Flair', 'is_Original', 'is_original', 'URL','creation_date','Text','Sentiment', 'Comments']
df.drop(drop_columns, axis = 1, inplace = True)

In [26]:
#Dropping rows with no title and body
df = df.drop(df[(df['Title'].isna()) & (df['Body'].isna())].index)

In [27]:
#Dropping subreddits with insufficient data
df = df.drop(df[(df['Subreddit'] == 'youthshouldknow') | (df['Subreddit'] == 'YouthandGovernment')].index)

In [28]:
df['Subreddit'].value_counts()

Subreddit
jobs                 2495
college              1769
Anxiety              1000
NeutralPolitics      1000
careerguidance       1000
depression_help      1000
Feminism             1000
Colombia             1000
Entrepreneur          999
highschool            998
mexico                998
whatsbotheringyou     996
ApplyingToCollege     995
engineering           994
dating                993
COVID19               992
science               991
povertyfinance        990
business              989
studentaffairs        986
SuicideWatch          986
computerscience       984
astrology             983
teenagers             957
YouthRights           934
AdviceForTeens        818
GradSchool            769
kidsrights            471
Name: count, dtype: int64

In [31]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 29087 entries, 0 to 37533
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   num_comments  29087 non-null  float64
 1   Title         29087 non-null  object 
 2   Subreddit     29087 non-null  object 
 3   Body          16360 non-null  object 
 4   Upvotes       29087 non-null  int64  
dtypes: float64(1), int64(1), object(3)
memory usage: 1.3+ MB


### Splitting Dataset

In [29]:
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(df, test_size = 0.2, random_state = 42)

train = train_set.drop('Subreddit', axis = 1)
train_labels = train_set['Subreddit'].copy()

test = test_set.drop('Subreddit', axis = 1)
test_labels = test_set['Subreddit'].copy()

### Pipeline

In [14]:
import spacy
nlp = spacy.load("en_core_web_lg")

In [36]:
#Encode Target variables
from sklearn.preprocessing import LabelEncoder

#Initialize
label_encoder = LabelEncoder()

#Encode train and testing labels
train_labels= label_encoder.fit_transform(train_labels)
test_labels= label_encoder.transform(test_labels)

In [37]:
# Data Cleaning and Preprocessing
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression


class TokenizerVectorizer(BaseEstimator, TransformerMixin):
    '''
    Custom transformer to tokenize and vectorize text data 
    '''

    def __init__(self,columns =['Title', 'Body']):
        self.columns= columns

    def fit(self, X, y=None):
       
        return self

    def transform(self, X):
        
        def preprocess_text(text):
            #Remove newline characters, and in string format
            
            text = str(text)
    
            text = text.replace("\n", " ")
        
            return text
    
    
        def vectorizer(doc):
        
            # select lowercase tokens that are not special characters or stop words
            tokens = [token.text.lower() for token in doc if not token.is_stop and token.text.isalpha()]
    
            #Getting the vectors for the tokens that are not stop words or contain punctuation
            vectors = [token.vector for token in doc if token.text.lower() in tokens]
    
            #Return the average vector (shape 300) for all the tokens in this particular instance
            if len(tokens) > 0:

                #Return average vector if contains tokens
                return np.mean(vectors, axis = 0) 

            else:

                #Return empty vector if no tokens
                return np.zeros(300,)

        #Copy of dataframe to make changes
        X = X.copy()
    
        #Apply functions to title and body columns, and vectorize the text
        X['Title'] = X['Title'].apply(preprocess_text)
        X['Body']= X['Body'].apply(preprocess_text)
        X['Vectors'] = [vectorizer(doc) for doc in nlp.pipe((X['Title'] + " " + X['Body'].fillna(" ")),disable=['ner','parser'])]

        #Put all columns into a 2-d array for classifier
        numerical = X[['num_comments','Upvotes']].values
        vectors = np.stack(X['Vectors'].values)
        X= np.hstack([numerical,vectors])

        return X

pipeline = Pipeline([
    ('tokenize_vectorize',TokenizerVectorizer(columns=['Title','Body'])), #Process dataframe
    ('rf_clf',RandomForestClassifier()) #Classifier
])

### Grid Search (Takes a few hours!)

In [17]:
import time

In [18]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score


param_grid = {
    'rf_clf__n_estimators': [100,150],  # Number of trees in the forest
    'rf_clf__max_depth': [20, 30],  # Max depth of the tree
    'rf_clf__class_weight':['balanced']
}


#


grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring= 'accuracy', verbose = 2 )

start = time.time()
grid_search.fit(train, train_labels)
end = time.time()

Fitting 3 folds for each of 4 candidates, totalling 12 fits
[CV] END rf_clf__class_weight=balanced, rf_clf__max_depth=20, rf_clf__n_estimators=100; total time=10.2min
[CV] END rf_clf__class_weight=balanced, rf_clf__max_depth=20, rf_clf__n_estimators=100; total time= 9.5min
[CV] END rf_clf__class_weight=balanced, rf_clf__max_depth=20, rf_clf__n_estimators=100; total time= 9.8min
[CV] END rf_clf__class_weight=balanced, rf_clf__max_depth=20, rf_clf__n_estimators=150; total time=10.0min
[CV] END rf_clf__class_weight=balanced, rf_clf__max_depth=20, rf_clf__n_estimators=150; total time=10.4min
[CV] END rf_clf__class_weight=balanced, rf_clf__max_depth=20, rf_clf__n_estimators=150; total time=10.0min
[CV] END rf_clf__class_weight=balanced, rf_clf__max_depth=30, rf_clf__n_estimators=100; total time=10.0min
[CV] END rf_clf__class_weight=balanced, rf_clf__max_depth=30, rf_clf__n_estimators=100; total time= 9.3min
[CV] END rf_clf__class_weight=balanced, rf_clf__max_depth=30, rf_clf__n_estimators=1

In [19]:
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
        print(mean_score,params)

0.7284798788381903 {'rf_clf__class_weight': 'balanced', 'rf_clf__max_depth': 20, 'rf_clf__n_estimators': 100}
0.7395249637159704 {'rf_clf__class_weight': 'balanced', 'rf_clf__max_depth': 20, 'rf_clf__n_estimators': 150}
0.7335082772177205 {'rf_clf__class_weight': 'balanced', 'rf_clf__max_depth': 30, 'rf_clf__n_estimators': 100}
0.7411150916852532 {'rf_clf__class_weight': 'balanced', 'rf_clf__max_depth': 30, 'rf_clf__n_estimators': 150}


In [20]:
elapsed = end - start
elapsed/60/60

2.1711674133274292

### Accuracy on Test Data

In [21]:
# Evaluate best model
best_model = grid_search.best_estimator_
best_model

In [22]:
accuracy = best_model.score(test, test_labels)
print(f"Best Model Accuracy: {accuracy}")

Best Model Accuracy: 0.7586799587487109


In [39]:
from sklearn.metrics import classification_report

y_pred= best_model.predict(test)

print(classification_report(test_labels, y_pred, target_names = label_encoder.classes_))

                   precision    recall  f1-score   support

   AdviceForTeens       0.85      0.79      0.82       161
          Anxiety       0.70      0.63      0.67       218
ApplyingToCollege       0.81      0.59      0.68       183
          COVID19       0.87      0.92      0.89       202
         Colombia       0.90      0.78      0.84       209
     Entrepreneur       0.81      0.81      0.81       190
         Feminism       0.67      0.68      0.68       199
       GradSchool       0.72      0.39      0.51       159
  NeutralPolitics       0.85      0.95      0.89       207
     SuicideWatch       0.65      0.75      0.69       173
      YouthRights       0.57      0.77      0.66       188
        astrology       0.78      0.75      0.77       191
         business       0.69      0.82      0.75       201
   careerguidance       0.82      0.70      0.75       191
          college       0.79      0.92      0.85       373
  computerscience       0.81      0.71      0.75       

### Accuracy on train data

In [41]:
best_model.score(train, train_labels)

1.0