In [8]:
from textblob import TextBlob
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
import re
nltk.download('punkt')
nltk.download('stopwords')
import scipy.sparse as sp
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn import svm
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\diede\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\diede\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [24]:
# Load the raw posts; later cells read the 'post' column of this frame.
data = pd.read_csv(filepath_or_buffer='political_leaning.csv')
# Pre-processes a given string: lowercasing, punctuation removal, tokenizing and stop-word removal.
# English stop words, loaded from the NLTK corpus on first use and then reused.
_STOP_WORDS_CACHE = None
# Translation table that deletes every character in string.punctuation.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def _english_stop_words():
    """Return the NLTK English stop words as a set, reading the corpus only once."""
    global _STOP_WORDS_CACHE
    if _STOP_WORDS_CACHE is None:
        _STOP_WORDS_CACHE = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE


def prepro(text):
    """Normalise a post for vectorisation.

    The text is lower-cased, stripped of ASCII punctuation, tokenized with
    NLTK's ``word_tokenize`` and filtered against the English stop-word list.

    Parameters
    ----------
    text : str
        Raw post text.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    # NOTE(review): punctuation is removed *before* stop-word filtering, so a
    # contraction such as "isn't" becomes "isnt" and can no longer match a
    # stop-list entry that contains an apostrophe — confirm this is intended.
    cleaned = text.lower().translate(_PUNCTUATION_TABLE)
    stop_words = _english_stop_words()
    kept_tokens = [word for word in word_tokenize(cleaned) if word not in stop_words]
    return ' '.join(kept_tokens)
# A function that uses TextBlob to perform basic sentiment analysis. Note that we convert the sentiment scores to qualitative labels here.
# This was, however, not very smart, as our SVM needs numeric (quantitative) input; because this step took a long time to run, we convert the labels back to numbers at a later stage (via one-hot encoding).
def sentiment(text, polarity_threshold=0.4, subjectivity_threshold=0.5):
    """Label the sentiment of ``text`` using TextBlob.

    Parameters
    ----------
    text : str
        Text to analyse.
    polarity_threshold : float, default 0.4
        Polarity strictly above this value is labelled 'POS'; polarity strictly
        below its negative is labelled 'NEG'; anything in between is 'NEUTRAL'.
    subjectivity_threshold : float, default 0.5
        Subjectivity strictly above this value is labelled 'SUBJECTIVE';
        otherwise the label is 'NEUTRAL'.

    Returns
    -------
    tuple of (str, str)
        ``(polarity_label, subjectivity_label)``.
    """
    scores = TextBlob(text).sentiment

    if scores.polarity > polarity_threshold:
        polarity_label = 'POS'
    elif scores.polarity < -polarity_threshold:
        polarity_label = 'NEG'
    else:
        polarity_label = 'NEUTRAL'

    if scores.subjectivity > subjectivity_threshold:
        subjectivity_label = 'SUBJECTIVE'
    else:
        subjectivity_label = 'NEUTRAL'

    return polarity_label, subjectivity_label



In [26]:
# Smoke test: subjectivity label (second tuple element) for the post at index label 1.
sentiment(data.loc[1, 'post'])[1]

'SUBJECTIVE'

In [27]:
# Build the derived columns and assign each one as a whole column.
# Element-wise chained assignment (data[col][x] = value) triggers pandas'
# SettingWithCopyWarning and does not write back to the frame when
# copy-on-write is enabled, so it is avoided here.
data['processed_text'] = [prepro(post) for post in data['post']]

# Sentiment analysis is slow: run it once per post and unpack both labels
# from the same result instead of calling it separately for each column.
polarity_labels = []
subjectivity_labels = []
for post in data['post']:
    polarity_label, subjectivity_label = sentiment(post)
    polarity_labels.append(polarity_label)
    subjectivity_labels.append(subjectivity_label)

data['sentiment_polarity'] = polarity_labels
data['sentiment_subjectivity'] = subjectivity_labels

In [29]:
# Cache the processed frame on disk so the slow pre-processing need not be re-run.
data.to_csv(path_or_buf='processed.csv', index=False)

In [4]:
# Fraction of rows to keep for faster experiments (e.g. 0.2); None keeps every row.
SAMPLE_FRACTION = None
# Whether to train on TF-IDF plus the one-hot sentiment columns (X_combined)
# instead of the TF-IDF columns alone. False reproduces the original split,
# which built X_combined but passed only X_vectorized to train_test_split.
# When set to True, the coefficient-inspection cell must also be extended:
# it pairs clf.coef_ with the vectorizer's feature names only.
USE_SENTIMENT_FEATURES = False

# Reload the cached pre-processing results.
data = pd.read_csv('processed.csv')
if SAMPLE_FRACTION is not None:
    data = data.sample(frac=SAMPLE_FRACTION, random_state=42)

# A post that is empty after pre-processing is written to CSV as an empty field
# and read back as NaN, which TfidfVectorizer rejects; restore the empty string.
data['processed_text'] = data['processed_text'].fillna('')

# NOTE(review): both transformers below are fitted on all rows before the
# train/test split, so test rows contribute to the vocabulary/IDF statistics.
# Fit on the training rows only if a strict hold-out evaluation is required.

# One-hot encode the two categorical sentiment labels.
encoder = OneHotEncoder()
X_sentiment = encoder.fit_transform(data[['sentiment_polarity', 'sentiment_subjectivity']])

# TF-IDF representation of the pre-processed text.
vectorizer = TfidfVectorizer()
X_vectorized = vectorizer.fit_transform(data['processed_text'])

# Text columns first, sentiment columns appended on the right.
X_combined = sp.hstack([X_vectorized, X_sentiment], format='csr')

# Target variable.
y = data['political_leaning']

# Hold out 20% of the rows for evaluation.
X_features = X_combined if USE_SENTIMENT_FEATURES else X_vectorized
X_train, X_test, y_train, y_test = train_test_split(X_features, y, test_size=0.2, random_state=42)

In [5]:
# Fit a support vector classifier with a linear kernel on the training split.
clf = svm.SVC(kernel='linear')
clf.fit(X=X_train, y=y_train)

In [11]:
# Vocabulary terms, in the column order of the TF-IDF matrix.
feature_names = vectorizer.get_feature_names_out()

# coef_ is a sparse matrix when the model was fitted on sparse input;
# densify only in that case so a dense coef_ does not raise on .toarray().
svm_coefs = clf.coef_.toarray() if sp.issparse(clf.coef_) else clf.coef_

# With more than two classes, SVC fits one classifier per pair of classes and
# coef_ holds one row per pair, ordered "0 vs 1", "0 vs 2", ..., "1 vs 2", ...
# over clf.classes_. A single row therefore ranks words for ONE class pair
# only, not for the model as a whole.
class_pairs = [
    (first, second)
    for position, first in enumerate(clf.classes_)
    for second in clf.classes_[position + 1:]
]
# Which pairwise classifier to inspect; 0 keeps the originally inspected row.
PAIR_INDEX = 0
first_class, second_class = class_pairs[PAIR_INDEX]

# One row per word with its coefficient in the selected pairwise classifier.
coefs_df = pd.DataFrame({'Feature': feature_names, 'Coefficient': svm_coefs[PAIR_INDEX]})

# Sort by absolute coefficient value to find the most influential words.
most_important_words = coefs_df.reindex(
    coefs_df['Coefficient'].abs().sort_values(ascending=False).index
)

print(f'Top words for the "{first_class}" vs "{second_class}" classifier')
print(most_important_words.head(10))

             Feature  Coefficient
105301       fucking    -2.162276
142683          isnt    -2.153745
91942            etc     1.983148
71709   deliberately    -1.979859
304852         whole    -1.975374
76648       disagree     1.861804
49117       campaign    -1.848820
200992      orthodox    -1.847472
284922           two    -1.838165
36113         beeing    -1.816784


In [6]:

# Score the fitted SVM on the held-out split and print the per-class metrics.
y_pred = clf.predict(X=X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

      center       0.60      0.80      0.68       974
        left       0.74      0.45      0.56       609
       right       0.64      0.57      0.60       707

    accuracy                           0.63      2290
   macro avg       0.66      0.60      0.62      2290
weighted avg       0.65      0.63      0.63      2290



In [10]:
# Baseline for comparison: logistic regression trained on the same split as the SVM.
logistic = LogisticRegression(max_iter=1000)
logistic.fit(X=X_train, y=y_train)

# Evaluate on the held-out split (this rebinds y_pred, replacing the SVM predictions).
y_pred = logistic.predict(X=X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

      center       0.58      0.81      0.68       974
        left       0.73      0.39      0.51       609
       right       0.64      0.55      0.59       707

    accuracy                           0.62      2290
   macro avg       0.65      0.58      0.59      2290
weighted avg       0.64      0.62      0.61      2290

