In [None]:
import pandas as pd
# Read the training CSV into `raw_df` and the test CSV into `data`
# (both paths are relative to the notebook's working directory).
raw_df, data = (
    pd.read_csv(csv_path)
    for csv_path in ('train_unbalanced.csv', 'test_main.csv')
)
# Preview the first rows of the raw training frame.
raw_df.head()

In [None]:
# Build the working frame `df` from `raw_df`:
#   1. keep only the text column and the three label columns,
#   2. drop every row with a missing value in any of those four columns,
#   3. cast the label columns to int,
#   4. keep the first occurrence of each distinct `content` value.
# The shape is printed before cleaning, after the NaN drop, and after de-duplication.
print(raw_df.shape)
df = raw_df[['content', 'Pro Trump', 'Pro Biden', 'Neutral']].dropna(how='any')
print(df.shape)
df = (
    df.astype({"Pro Trump": int, "Pro Biden": int, "Neutral": int})
      .drop_duplicates(subset='content', keep='first')
)
print(df.shape)
df.head()

In [None]:
# Cap the working frame at its first 1000 rows.
df = df[:1000]

# Every column after the first (`content`) is treated as a label column.
cols = df.columns
label_cols = [column_name for column_name in cols[1:]]
num_labels = len(label_cols)
print('Label columns: ', label_cols)

In [None]:
# Class-balance check (may motivate down- or up-sampling).
# The column sum equals the number of 1s as long as labels are strictly 0/1.
label_frame = df[label_cols]
ones_per_label = label_frame.sum()
zeros_per_label = (label_frame == 0).sum()
print('Count of 1 per label: \n', ones_per_label, '\n')
print('Count of 0 per label: \n', zeros_per_label)

In [None]:
# Shuffle the rows and renumber the index from 0.
# A fixed seed makes the row order (and everything trained on it) reproducible
# across kernel restarts; the value mirrors the classifier's random_state.
df = df.sample(frac=1, random_state=1).reset_index(drop=True)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import RegexpTokenizer

# Tokens are maximal runs of ASCII letters and digits; everything else is a separator.
token = RegexpTokenizer(r'[a-zA-Z0-9]+')

# Bag-of-words vectoriser: lower-cased unigrams, English stop words removed,
# using the regexp tokenizer above instead of the built-in token pattern.
cv = CountVectorizer(
    tokenizer=token.tokenize,
    ngram_range=(1, 1),
    stop_words='english',
    lowercase=True,
)

In [None]:
from sklearn.model_selection import train_test_split
# Work on a copy so later changes to `train_df` do not touch `df`.
train_df = df.copy()
count = len(train_df)

# Learn the vocabulary from the training text only, then project the test text
# onto that same vocabulary. Fitting on train+test together would let the
# evaluation data influence the feature space (preprocessing leakage).
train_text = cv.fit_transform(train_df['content'])
test_text = cv.transform(data['content'])

In [None]:
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, ExtraTreesClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB

# Base estimator: extremely-randomised trees with a fixed seed.
# (MultinomialNB() was the alternative tried in earlier experiments.)
model = ExtraTreesClassifier(random_state=1, n_estimators=100)

# One independent copy of the base estimator is fitted per label column,
# using all available cores.
classifier = MultiOutputClassifier(model, n_jobs=-1)

# The tree model is fitted directly on the sparse count matrix.
classifier.fit(train_text, train_df[label_cols])

In [None]:
from sklearn import metrics
# Sanity check: score the model on the same rows it was fitted on.
# This is TRAINING accuracy (the original message mislabelled it as test accuracy).
# The sparse matrix is passed as-is, matching what the classifier was fitted on,
# so no dense copy of the count matrix is materialised.
predicted = classifier.predict(train_text)
# For multi-label targets accuracy_score is exact-match: a row counts as
# correct only if every label column is predicted correctly.
print("Train Accuracy:", metrics.accuracy_score(train_df[label_cols].values, predicted))

In [None]:
from sklearn.metrics import average_precision_score, accuracy_score
from sklearn.metrics import recall_score, precision_score, f1_score
from sklearn.metrics import confusion_matrix
try:
    # plot_confusion_matrix was removed in scikit-learn 1.2. Guard the import so
    # this cell (and the confusion_matrix import above) still works on newer
    # versions while the name stays available on older ones.
    from sklearn.metrics import plot_confusion_matrix
except ImportError:
    plot_confusion_matrix = None

# Predict on the held-out test matrix; the sparse matrix is passed directly
# instead of densifying it, which avoids an (n_rows x vocabulary) dense copy.
predicted = classifier.predict(test_text)

# Ground-truth label columns of the test frame, in the same order as label_cols.
true_labels = data[label_cols]

# Micro-averaged scores pool every (row, label) decision together;
# accuracy is exact-match across all label columns of a row.
print('F1 score: ', f1_score(true_labels, predicted, average='micro'))
print('Accuracy: ', accuracy_score(true_labels, predicted))
print('Precision: ', precision_score(true_labels, predicted, average='micro'))
print('Recall: ', recall_score(true_labels, predicted, average='micro'))

In [None]:
from matplotlib import pyplot as plt
import numpy as np
import seaborn as sns
# Collapse each multi-label row to a single class index (position of its
# largest entry) so a standard single-label confusion matrix can be drawn.
# NOTE(review): argmax returns index 0 for an all-zero row, so a prediction
# with no label set is counted under the first label column.
true_bools1 = np.array(data[label_cols])
pred_bools1 = np.array(predicted)

# Pin the class list so the matrix always has one row/column per label column,
# even if a class never appears in either the true or the predicted indices;
# otherwise the matrix shrinks and the tick labels below would be misaligned.
class_ids = np.arange(len(label_cols))
cm = confusion_matrix(
    true_bools1.argmax(axis=1),
    pred_bools1.argmax(axis=1),
    labels=class_ids,
)

fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, ax=ax, cmap='Blues', fmt="d")

ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted Labels')
ax.set_ylabel('True Labels')

# Tick text comes from label_cols so it cannot drift from the column order
# used to build the matrix.
ax.xaxis.set_ticklabels(label_cols)
ax.yaxis.set_ticklabels(label_cols);