In [1]:
# imports

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.model_selection import train_test_split

%matplotlib inline

In [2]:
# reading data

df = pd.read_csv('train.csv')
submission = pd.read_csv('submission.csv')
submission.shape

(79707, 6)

In [3]:
# cleaning up df
df =df.dropna()
df['Body'].astype(str)

df.head(5)

Unnamed: 0.1,Unnamed: 0,Author,Body,Created,Post,Score,Subreddit
0,0,BobSponge22,"Even when there's 0 active cases, they'll stil...",1589775859,Democrats Have Abandoned Civil Liberties,1,Republican
1,1,MsWumpkins,Yup. It's awful.,1585607566,"Donald Trump’s Briefing Snubbed, Critics No Lo...",2,democrats
2,2,mickey_patches,"I agree with what you say, more or less, but I...",1485285435,Top Republican Says Balancing The Budget As Im...,3,Republican
3,3,sdrawkcabemanresu11,No. [Source](https://www.govtrack.us/congress/...,1493234043,House GOP health bill changes exempt members o...,39,Republican
4,4,Mhunterjr,"Don't tell me which Rich, Racist, republic oli...",1581998270,Bloomberg is avoiding all scrutiny. It's time ...,4,democrats


In [19]:
# converting subreddit vals to ints MAKE SURE DEMOCRATS IS SET TO 1 FOR SUBMISSION

index_democrat = df[df['Subreddit'] == 'democrats'].index
df['Subreddit'] =0 
df.loc[index_democrat, 'Subreddit'] =1

In [20]:
# splitting data
x = df['Body']
y = df['Subreddit']
xtr, xts, ytr, yts = train_test_split(x, y, test_size=0.30, stratify=y)

In [21]:
word_vectorizer = TfidfVectorizer(max_features=10000, lowercase=True, stop_words= 'english', ngram_range=(1,1))

# creating tf-idf matrices
xtr_vect = word_vectorizer.fit_transform(xtr)
xts_vect = word_vectorizer.fit_transform(xts)
xts_vect.shape

(54312, 10000)

In [22]:
lr = LogisticRegression(max_iter=10000, class_weight=({0:1}))
lr.fit(xtr_vect, ytr)
preds = lr.predict(xts_vect)

print('\nConfusion matrix\n',confusion_matrix(yts, preds))
print(f'f1 score: {f1_score(yts, preds)}')


Confusion matrix
 [[15777 12135]
 [13453 12947]]
f1 score: 0.5029719125131114


In [23]:
sub_input = submission['Body'].astype(str)
sub_vec = word_vectorizer.fit_transform(sub_input)
submission['Subreddit'] = lr.predict(sub_vec)

In [24]:
print(submission.shape)
submission.head()

(79707, 7)


Unnamed: 0.1,Unnamed: 0,Author,Body,Created,Post,Score,Subreddit
0,0,Picklesadog,And my argument is that the majority of democr...,1501799501,Poll: McCain more popular with Democrats than ...,1,1
1,1,,I’m not necessarily disagreeing with you but d...,1590457879,Paying Illegal Immigrants Puts America Last,1,0
2,2,Stunkstank,That’s not how the constitution works. Are you...,1523294396,Is Gun Ownership a Right?,1,0
3,3,viverator,Johnny five need input.,1587698971,“Senate Republicans slipped a tax break for we...,1,1
4,4,Broke_Poetry,Their jobs and pensions are being threatened o...,1598166788,Washington Postal Workers Defy USPS Orders And...,3,0


In [27]:
submission['Subreddit'].to_csv('sample_submission.csv')