In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
import os
import re
random.seed(42)

In [4]:
import kagglehub


# Download latest version
path = kagglehub.dataset_download("prakharrathi25/reddit-data-huge")

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# List all CSV files in the folder
csv_files = [f for f in os.listdir(path) if f.endswith('.csv')]

# Initialize a list to hold DataFrames
dataframes = []

# Read each CSV file into a DataFrame and add it to the list
for csv_file in csv_files:
    file_path = os.path.join(path, csv_file)
    df = pd.read_csv(file_path)
    dataframes.append(df)

# Optionally, combine all DataFrames into a single DataFrame
combined_df = pd.concat(dataframes, ignore_index=True)

In [144]:
df= combined_df.copy()

#Dropping unnecessary columns
drop_columns = ['Unnamed: 0', 'ID', 'Flair', 'is_Original', 'is_original', 'URL','creation_date','Text','Sentiment', 'Comments']
df.drop(drop_columns, axis = 1, inplace = True)

In [185]:
#Dropping rows with no title and body
df = df.drop(df[(df['Title'].isna()) & (df['Body'].isna())].index)

In [191]:
#Dropping subreddits with insufficient data
df = df.drop(df[(df['Subreddit'] == 'youthshouldknow') | (df['Subreddit'] == 'YouthandGovernment')].index)

In [200]:
df['Subreddit'].value_counts()

Subreddit
jobs                 2495
college              1769
Anxiety              1000
NeutralPolitics      1000
careerguidance       1000
depression_help      1000
Feminism             1000
Colombia             1000
Entrepreneur          999
highschool            998
mexico                998
whatsbotheringyou     996
ApplyingToCollege     995
engineering           994
dating                993
COVID19               992
science               991
povertyfinance        990
business              989
studentaffairs        986
SuicideWatch          986
computerscience       984
astrology             983
teenagers             957
YouthRights           934
AdviceForTeens        818
GradSchool            769
kidsrights            471
Name: count, dtype: int64

### Splitting Dataset

In [207]:
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(df, test_size = 0.2, random_state = 42)

train = train_set.drop('Subreddit', axis = 1)
train_labels = train_set['Subreddit'].copy()

test = test_set.drop('Subreddit', axis = 1)
test_labels = test_set['Subreddit'].copy()

### Pipeline

In [208]:
import spacy
nlp = spacy.load("en_core_web_lg")

In [269]:
#Encode Target variables
from sklearn.preprocessing import LabelEncoder

#Initialize
label_encoder = LabelEncoder()

#Encode train and testing labels
train_labels= label_encoder.fit_transform(train_labels)
test_labels= label_encoder.transform(test_labels)

In [255]:
# Data Cleaning and Preprocessing
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression


class TokenizerVectorizer(BaseEstimator, TransformerMixin):
    '''
    Custom transformer to tokenize and vectorize text data 
    '''

    def __init__(self,columns =['Title', 'Body']):
        self.columns= columns

    def fit(self, X, y=None):
       
        return self

    def transform(self, X):
        
        def preprocess_text(text):
            #Remove newline characters, and in string format
            
            text = str(text)
    
            text = text.replace("\n", " ")
        
            return text
    
    
        def vectorizer(doc):
        
            # select lowercase tokens that are not special characters or stop words
            tokens = [token.text.lower() for token in doc if not token.is_stop and token.text.isalpha()]
    
            #Getting the vectors for the tokens that are not stop words or contain punctuation
            vectors = [token.vector for token in doc if token.text.lower() in tokens]
    
            #Return the average vector (shape 300) for all the tokens in this particular instance
            if len(tokens) > 0:

                #Return average vector if contains tokens
                return np.mean(vectors, axis = 0) 

            else:

                #Return empty vector if no tokens
                return np.zeros(300,)

        #Copy of dataframe to make changes
        X = X.copy()
    
        #Apply functions to title and body columns, and vectorize the text
        X['Title'] = X['Title'].apply(preprocess_text)
        X['Body']= X['Body'].apply(preprocess_text)
        X['Vectors'] = [vectorizer(doc) for doc in nlp.pipe((X['Title'] + " " + X['Body'].fillna(" ")),disable=['ner','parser'])]

        #Put all columns into a 2-d array for classifier
        numerical = X[['num_comments','Upvotes']].values
        vectors = np.stack(X['Vectors'].values)
        X= np.hstack([numerical, vectors])

        return X

pipeline = Pipeline([
    ('tokenize_vectorize',TokenizerVectorizer(columns=['Title','Body'])), #Process dataframe
    ('rf_clf',RandomForestClassifier()) #Classifier
])

### Grid Search (Takes a few hours!)

In [256]:
import time

In [274]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score


param_grid = {
    'rf_clf__n_estimators': [50, 100],  # Number of trees in the forest
    'rf_clf__max_depth': [10, 20, 30, None],  # Max depth of the tree
    #'rf_clf__min_samples_split': [2, 5, 10],  # Minimum number of samples to split a node
    #'rf_clf__min_samples_leaf': [1, 2, 4],  # Minimum number of samples to be at a leaf node
}


#


grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring= 'accuracy', verbose = 2 )

start = time.time()
grid_search.fit(train, train_labels)
end = time.time()

Fitting 3 folds for each of 8 candidates, totalling 24 fits
[CV] END ...rf_clf__max_depth=None, rf_clf__n_estimators=100; total time=   2.7s
[CV] END ....rf_clf__max_depth=None, rf_clf__n_estimators=50; total time=   2.9s
[CV] END ...rf_clf__max_depth=None, rf_clf__n_estimators=100; total time=   1.8s
[CV] END ......rf_clf__max_depth=30, rf_clf__n_estimators=50; total time=   3.5s
[CV] END .....rf_clf__max_depth=30, rf_clf__n_estimators=100; total time=   1.6s
[CV] END ....rf_clf__max_depth=None, rf_clf__n_estimators=50; total time=   3.0s
[CV] END ...rf_clf__max_depth=None, rf_clf__n_estimators=100; total time=   1.1s
[CV] END ......rf_clf__max_depth=10, rf_clf__n_estimators=50; total time=12.5min
[CV] END ......rf_clf__max_depth=10, rf_clf__n_estimators=50; total time= 9.6min
[CV] END ......rf_clf__max_depth=10, rf_clf__n_estimators=50; total time= 9.1min
[CV] END .....rf_clf__max_depth=10, rf_clf__n_estimators=100; total time= 9.4min
[CV] END .....rf_clf__max_depth=10, rf_clf__n_est

In [280]:
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
        print(mean_score,params)

0.6856763146094687 {'rf_clf__max_depth': 10, 'rf_clf__n_estimators': 50}
0.7010187596339198 {'rf_clf__max_depth': 10, 'rf_clf__n_estimators': 100}
0.7057462547095993 {'rf_clf__max_depth': 20, 'rf_clf__n_estimators': 50}
0.7303280711434473 {'rf_clf__max_depth': 20, 'rf_clf__n_estimators': 100}
0.7143840854541891 {'rf_clf__max_depth': 30, 'rf_clf__n_estimators': 50}
0.7334653108188739 {'rf_clf__max_depth': 30, 'rf_clf__n_estimators': 100}
0.710645382680633 {'rf_clf__max_depth': None, 'rf_clf__n_estimators': 50}
0.7286948161014859 {'rf_clf__max_depth': None, 'rf_clf__n_estimators': 100}


In [278]:
elapsed = end - start
elapsed/60/60

3.987560488912794

### Accuracy on Test Data

In [275]:
# Evaluate best model
best_model = grid_search.best_estimator_
best_model

In [276]:
accuracy = best_model.score(test, test_labels)
print(f"Best Model Accuracy: {accuracy}")

Best Model Accuracy: 0.7552423513234788
