## Load Data from Reddit and Twitter datasets 
Load both datasets and clean the text: strip links and non-letter characters, lowercase, remove stopwords, and lemmatize.

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re
import nltk
from nltk.corpus import stopwords
import nltk as nlp
import sklearn

# Start from NLTK's English stopword list and extend it with extra tokens
# that the analysis treats as noise.
# NOTE: this rebinds the name `stopwords` from the imported NLTK corpus
# reader to a plain Python list; `clean` below reads that list.
stopwords = stopwords.words('english')
stopwords.extend(["rt", "u", "amp", "w", "th"])

# Read both raw corpora from disk (paths are relative to the notebook).
twitter_data = pd.read_csv("twitter/ExtractedTweets.csv")
reddit_data = pd.read_csv("reddit/file_name.csv")

# Replace missing Reddit bodies/titles with empty strings so that the
# concatenation below never produces NaN.
for column in ("Text", "Title"):
    reddit_data[column] = reddit_data[column].fillna('')

# Title and body, separated by one space, form the document to classify.
reddit_data['Post'] = reddit_data['Title'] + ' ' + reddit_data['Text']

# Cleaning posts 
def clean(post):
    """Normalize one raw post/tweet into a space-separated string of lemmas.

    Processing order:
      1. remove links (anything starting with ``http`` up to the next whitespace),
      2. replace every character that is not an ASCII letter with a space,
      3. lowercase,
      4. tokenize with ``nltk.word_tokenize``,
      5. drop tokens found in the module-level ``stopwords`` list,
      6. lemmatize each remaining token with NLTK's ``WordNetLemmatizer``.

    Parameters
    ----------
    post : str
        Raw text. Any non-string value (e.g. NaN from a missing CSV cell)
        is treated as an empty document.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; ``""`` when nothing is left.
    """
    # Missing values arrive from pandas as float NaN / None; re.sub would
    # raise TypeError on them, so map them to an empty document instead.
    if not isinstance(post, str):
        return ""

    text = re.sub(r'http\S+', '', post)  # remove links
    text = re.sub("[^a-zA-Z]", " ", text)  # keep letters only
    tokens = nltk.word_tokenize(text.lower())

    # Build the lookup set once per call instead of once per token.
    stop_set = set(stopwords)
    lemmatizer = nlp.WordNetLemmatizer()
    return " ".join(lemmatizer.lemmatize(token) for token in tokens if token not in stop_set)

# Run every document of both corpora through `clean`, overwriting the raw
# text column with its normalized form.
reddit_data['Post'] = reddit_data['Post'].map(clean)
twitter_data['Tweet'] = twitter_data['Tweet'].map(clean)

## Split training and test sets

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline

# Reddit: X is the cleaned post, y is the political lean renamed to the
# party vocabulary ("Democrat"/"Republican") used for evaluation later on.
# Any lean value other than "Liberal"/"Conservative" becomes NaN under map().
reddit_df = reddit_data.assign(
    X=reddit_data['Post'],
    y=reddit_data['Political Lean'].map({"Liberal": 'Democrat', "Conservative": 'Republican'}),
)

# Twitter: X is the cleaned tweet, y is the party column as-is.
twitter_df = twitter_data.assign(X=twitter_data["Tweet"], y=twitter_data["Party"])

# Both corpora get the same 80/20 stratified split with a fixed seed so the
# partitions are reproducible across runs.
split_options = {"test_size": 0.2, "random_state": 1218}

reddit_X_train, reddit_X_test, reddit_y_train, reddit_y_test = train_test_split(
    reddit_df['X'], reddit_df['y'], stratify=reddit_df['y'], **split_options
)

twitter_X_train, twitter_X_test, twitter_y_train, twitter_y_test = train_test_split(
    twitter_df['X'], twitter_df['y'], stratify=twitter_df['y'], **split_options
)

## Model Training
Train a separate Naive Bayes classifier on the Twitter data and on the Reddit data.

In [3]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, cohen_kappa_score
from sklearn.model_selection import cross_validate
import json

def fit_model(X_train, y_train, clf, vocabulary=None):
    """Fit a bag-of-words -> TF-IDF -> classifier pipeline and return it.

    Parameters
    ----------
    X_train : iterable of str
        Training documents.
    y_train : array-like
        Target label for each training document.
    clf : estimator
        Classifier used as the final pipeline step; it is fitted in place
        as part of the pipeline.
    vocabulary : mapping or iterable, optional
        Fixed vocabulary handed to ``CountVectorizer``. ``None`` or an empty
        container means the vocabulary is learned from ``X_train``.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The fitted pipeline with steps ``'vect'``, ``'tfidf'`` and ``'clf'``.
    """
    # A plain truthiness test (`if vocabulary`) raises ValueError for NumPy
    # arrays / pandas objects, e.g. the result of get_feature_names_out().
    # Decide explicitly instead; an empty container still falls back to a
    # learned vocabulary.
    try:
        use_fixed_vocabulary = vocabulary is not None and len(vocabulary) > 0
    except TypeError:
        # Unsized iterables (e.g. generators) are passed through unchanged.
        use_fixed_vocabulary = True

    vectorizer = CountVectorizer(vocabulary=vocabulary if use_fixed_vocabulary else None)
    post_clf = Pipeline([
        ('vect', vectorizer),
        ('tfidf', TfidfTransformer()),
        ("clf", clf)
    ])
    post_clf.fit(X_train, y_train)
    return post_clf

def evaluate(model_name, y_test, y_pred, labels):
    """Print a classification report and Cohen's kappa for one experiment.

    Parameters
    ----------
    model_name : str
        Heading printed above the report.
    y_test : array-like
        Ground-truth class labels.
    y_pred : array-like
        Predicted class labels, aligned with ``y_test``.
    labels : list
        Class label values to report on, in the order the rows should appear.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    print("Model: {}\n".format(model_name))
    # `target_names` alone is matched positionally against the *sorted* class
    # values, so a differently ordered `labels` list would silently mislabel
    # the rows. Passing `labels=` as well pins each row to its actual class.
    print(classification_report(y_test, y_pred, labels=labels, target_names=labels))
    kappa = cohen_kappa_score(y_test, y_pred)
    print("Cohen's Kappa:", round(kappa,2))
    print("========================================================\n")

In [4]:
# One multinomial Naive Bayes pipeline per corpus. Each call receives its own
# estimator instance built from the same hyper-parameters.
nb_settings = {"alpha": 0.1, "fit_prior": True}
clf_twitter_train = fit_model(twitter_X_train, twitter_y_train, MultinomialNB(**nb_settings))
clf_reddit_train = fit_model(reddit_X_train, reddit_y_train, MultinomialNB(**nb_settings))

## Test
Evaluate each classifier on both test sets: its own (in-domain) and the other dataset's (cross-dataset).

In [8]:
# Score every trained pipeline on every held-out split, giving the four
# train/test combinations in the order: Twitter->Twitter, Twitter->Reddit,
# Reddit->Twitter, Reddit->Reddit.
party_labels = ["Democrat", "Republican"]
trained_models = [("Twitter", clf_twitter_train), ("Reddit", clf_reddit_train)]
test_sets = [
    ("Twitter", twitter_X_test, twitter_y_test),
    ("Reddit", reddit_X_test, reddit_y_test),
]

for train_d, clf in trained_models:
    for test_d, test_input, gt in test_sets:
        report_title = "Naive Bayes, Training Set: {}, Test Set: {}".format(train_d, test_d)
        evaluate(report_title, gt, clf.predict(test_input), party_labels)

Model: Naive Bayes, Training Set: Twitter, Test Set: Twitter

              precision    recall  f1-score   support

    Democrat       0.82      0.78      0.80      8414
  Republican       0.80      0.83      0.82      8878

    accuracy                           0.81     17292
   macro avg       0.81      0.81      0.81     17292
weighted avg       0.81      0.81      0.81     17292

Cohen's Kappa: 0.61

Model: Naive Bayes, Training Set: Twitter, Test Set: Reddit

              precision    recall  f1-score   support

    Democrat       0.68      0.69      0.68      1664
  Republican       0.41      0.40      0.41       907

    accuracy                           0.59      2571
   macro avg       0.54      0.54      0.54      2571
weighted avg       0.58      0.59      0.59      2571

Cohen's Kappa: 0.09

Model: Naive Bayes, Training Set: Reddit, Test Set: Twitter

              precision    recall  f1-score   support

    Democrat       0.49      0.89      0.63      8414
  Republica