In [None]:
import pandas as pd
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler

# Custom imports
from helper_functions import evaluate_clf, label_encode
from StylometryPrep import StylometryFeatureExtractor

In [None]:
# Notebook configuration: the column used as the classification target and
# the CSV file that is read into the working DataFrame.
FEATURE = "political_leaning"
DATA_PATH = "lai-data/political_leaning.csv"

In [None]:
# Number of leading rows to load for quick experiments.
# Set to None to run on the full dataset.
N_ROWS = 3000

# Let the CSV parser stop after N_ROWS rows instead of reading the whole file
# and slicing it down afterwards; the resulting frame holds the same rows.
df_politics = pd.read_csv(DATA_PATH, nrows=N_ROWS)

# label_encode is a project helper (helper_functions); it returns a frame and
# an encoder-like object whose `classes_` attribute is passed to evaluate_clf
# in later cells.
# NOTE(review): `df` is not used by any other visible cell (the train/test
# split reads from df_politics). Confirm whether label_encode modifies
# df_politics in place or whether the split should use `df` instead.
df, le = label_encode(df_politics, FEATURE)

In [None]:
# Hold out 30% of the posts for evaluation. The fixed random_state makes the
# shuffle repeatable, so the same train/test partition is produced after a
# kernel restart and results from different runs can be compared.
# NOTE(review): the labels are taken from df_politics, not from the `df`
# returned by label_encode — confirm that this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    df_politics['post'],
    df_politics[FEATURE],
    test_size=0.3,
    random_state=42,
)

In [None]:
# Load the spaCy "en_core_web_sm" English pipeline into `nlp`.
# NOTE(review): `nlp` is not referenced by any other cell visible in this
# notebook — confirm whether something outside this view depends on it
# before removing the (potentially slow) load.
nlp = spacy.load("en_core_web_sm")

## 1. BoW classification

In [None]:
# Baseline model: raw token counts (English stop words removed) fed straight
# into a linear SVM.
bow_vectorizer = CountVectorizer(stop_words="english")
bow_svc = LinearSVC()
clf = Pipeline(steps=[
    ("vectorizer", bow_vectorizer),
    ("svc", bow_svc),
])

# Train on the training split; left as the cell's last expression so the
# notebook shows the returned estimator.
clf.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out posts with the BoW baseline, then print the
# per-class metrics table.
y_pred = clf.predict(X_test)
bow_report = classification_report(y_true=y_test, y_pred=y_pred)
print(bow_report)

## 2. Stylometry + BoW

In [None]:
# Combined model: token counts and stylometric features side by side.
# Further text-based feature extractors can be appended to this list.
combined_features = FeatureUnion(transformer_list=[
    ("text", CountVectorizer()),
    ("stylometry", StylometryFeatureExtractor()),
])

# with_mean=False scales the features without centering them.
clf_stylometry = Pipeline(steps=[
    ("features", combined_features),
    ("scaler", StandardScaler(with_mean=False)),
    ("clf", LinearSVC()),
])

# Train on the training split; left as the cell's last expression so the
# notebook shows the returned estimator.
clf_stylometry.fit(X_train, y_train)

In [None]:
# Evaluate the stylometry + BoW model on the held-out split.
# evaluate_clf is a project helper (helper_functions); it receives the fitted
# pipeline, the test posts and labels, and the class names from the label
# encoder. Presumably it reports per-class metrics — verify against the helper.
evaluate_clf(clf_stylometry, X_test, y_test, classes=le.classes_)

## 3. Pure Stylometry

Note: stylometric features on their own turn out to be of very little use for this task.

In [None]:
# Stylometry-only model: the same scaler and linear SVM as the combined
# pipeline, but with the stylometric extractor as the sole feature source.
stylometry_only = FeatureUnion(transformer_list=[
    ("stylometry", StylometryFeatureExtractor()),
])

# with_mean=False scales the features without centering them.
clf_pure_stylometry = Pipeline(steps=[
    ("features", stylometry_only),
    ("scaler", StandardScaler(with_mean=False)),
    ("clf", LinearSVC()),
])

# Train on the training split; left as the cell's last expression so the
# notebook shows the returned estimator.
clf_pure_stylometry.fit(X_train, y_train)

In [None]:
# Evaluate the stylometry-only model on the held-out split.
# evaluate_clf is a project helper (helper_functions); it receives the fitted
# pipeline, the test posts and labels, and the class names from the label
# encoder. Presumably it reports per-class metrics — verify against the helper.
evaluate_clf(clf_pure_stylometry, X_test, y_test, classes=le.classes_)

In [None]:
# Alex Jones test
sentence = "I don't like them putting chemicals in the water that turn the frigging frogs gay."

# Run the one-off sentence through the BoW model and then the
# stylometry + BoW model, printing each model's predicted label.
for model in (clf, clf_stylometry):
    print(model.predict([sentence])[0])