In [1]:
import praw
import os
import pandas as pd
import requests


import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('./reddit_combined_df.csv')

In [3]:
def preprocess_text(text):
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))

    words = text.split()
    processed_words = [
    lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    
    processed_text = " ".join(processed_words)
    return processed_text

In [4]:
df['selftext'] = df['selftext'].fillna('')

In [6]:
df['text'] = (df['title'] + ' : ' + df['selftext']).apply(preprocess_text)

In [7]:
df.head()

Unnamed: 0,subreddit,title,selftext,utc,text
0,dogs,Best pet friendly couches,I am looking to replace my couches and looking...,1732064000.0,Best pet friendly couch : I looking replace co...
1,dogs,Hello beautiful dog owners!,I got offered a Pembroke puppy. She’s lower p...,1732061000.0,Hello beautiful dog owners! : I got offered Pe...
2,dogs,best dog breed for a nervous cat?,hi! hope im doing this right lol i dont use re...,1732055000.0,best dog breed nervous cat? : hi! hope im righ...
3,dogs,Can I train my dog to like soft toys without r...,I have the sweetest 5 y/o Newfie with quite a ...,1732039000.0,Can I train dog like soft toy without ripping ...
4,dogs,Megathread: Why Does My Dog Do That?,Does your dog turn his head when he hears an o...,1732036000.0,Megathread: Why Does My Dog Do That? : Does do...


In [8]:
df['subreddit'].value_counts()

subreddit
personalfinance    3916
dogs               3794
Name: count, dtype: int64

In [9]:
X = df['text']
y = df['subreddit']

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [11]:
vect = CountVectorizer(ngram_range=(1, 3), stop_words='english', max_features=5000)
X_train_vec = vect.fit_transform(X_train)
X_test_vec = vect.transform(X_test)

In [12]:
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train_vec, y_train)

In [13]:
lr_pred = lr_model.predict(X_test_vec)
print(classification_report(y_test, lr_pred))

                 precision    recall  f1-score   support

           dogs       1.00      1.00      1.00       759
personalfinance       1.00      1.00      1.00       783

       accuracy                           1.00      1542
      macro avg       1.00      1.00      1.00      1542
   weighted avg       1.00      1.00      1.00      1542



In [14]:
lr_model.score(X_train_vec, y_train)

1.0

In [15]:
lr_model.score(X_test_vec, y_test)

1.0

In [16]:
df['predicted_subreddit'] = df['text'].apply(lambda x: lr_model.predict(vect.transform([x]))[0])

In [17]:
df['correct'] = df['predicted_subreddit'] == df['subreddit']

In [18]:
df.head(2)

Unnamed: 0,subreddit,title,selftext,utc,text,predicted_subreddit,correct
0,dogs,Best pet friendly couches,I am looking to replace my couches and looking...,1732064000.0,Best pet friendly couch : I looking replace co...,dogs,True
1,dogs,Hello beautiful dog owners!,I got offered a Pembroke puppy. She’s lower p...,1732061000.0,Hello beautiful dog owners! : I got offered Pe...,dogs,True


In [19]:
new_data = [
    "What breed is best for a small apartment?",
    "What are the top 5 ways to start for retirement?",
    "My pup loves playing fetch",
    "How can I invest in index funds?"
]

In [20]:
def predict_subreddit(new_posts):
    # Preprocess the new posts
    new_posts_processed = [preprocess_text(post) for post in new_posts]
    # Vectorize the preprocessed text
    new_posts_vec = vect.transform(new_posts_processed)
    # Predict the subreddits using the model
    predictions = lr_model.predict(new_posts_vec)
    return predictions

In [21]:
predictions = predict_subreddit(new_data)
for post, prediction in zip(new_data, predictions):
    print(f"Post: {post}\nPredicted subreddit: {prediction}\n")

Post: What breed is best for a small apartment?
Predicted subreddit: dogs

Post: What are the top 5 ways to start for retirement?
Predicted subreddit: personalfinance

Post: My pup loves playing fetch
Predicted subreddit: dogs

Post: How can I invest in index funds?
Predicted subreddit: personalfinance



In [33]:
df_new = pd.DataFrame(new_data, columns=["text"])

# Step 2: Predict subreddits using the previously defined predict_subreddit function
predictions = predict_subreddit(new_data)

# Step 3: Add the predictions as a new column
df_new["predicted_subreddit"] = predictions

# Step 4: Display the resulting DataFrame
df_new.head()


Unnamed: 0,text,predicted_subreddit
0,What breed is best for a small apartment?,dogs
1,What are the top 5 ways to start for retirement?,personalfinance
2,My pup loves playing fetch,dogs
3,How can I invest in index funds?,personalfinance


In [25]:
def predictd_subreddit(new_post):
    new_post_vec = vect.transform([new_post])
    predicted_subreddit = lr_model.predict(new_post_vec)
    return predicted_subreddit

In [27]:
prediction = predictd_subreddit(new_data)
for post, prediction in zip(new_data, predictions):
    print(f"Post: {post}\nPredicted subreddit: {prediction}\n")

AttributeError: 'list' object has no attribute 'lower'

In [28]:
def predictd(new_post):
    # Preprocess the input text
    new_post_processed = preprocess_text(new_post)
    
    # Transform the preprocessed text into the vectorized form
    new_post_vec = vect.transform([new_post_processed])
    
    # Predict the subreddit
    return lr_model.predict(new_post_vec)

In [29]:
predictd_po = predictd(new_post)
print(predictd_po)

NameError: name 'new_post' is not defined

In [30]:
pri = predictd(new_data)
print(pri)

AttributeError: 'list' object has no attribute 'split'

In [95]:
new = "What couch should I use?"

In [96]:
pred = predictd(new)
print(pred)

personalfinance


In [99]:
cat = 'best region to use'
predin = predictd(cat)
print(predin)

personalfinance


In [101]:
pup = 'best for furry friends'
pre = predictd(pup)
print(pre)

personalfinance
