In [7]:
#Importing relevant libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, accuracy_score
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

In [6]:
# Read the feature table from CSV; the bare expression on the last line renders it as the cell output
DATA_PATH = 'features_dataset.csv'
features_df = pd.read_csv(DATA_PATH)
features_df

Unnamed: 0,label,TTR,text_polarity,text_subjectivity,title_polarity,title_subjectivity,doc_perplexity,1_grams,average_word_length
0,1,0.844037,0.091481,0.524184,-0.125000,0.125000,0.867938,"['donald', 'trump', 'met', 'member', 'nato', '...",6.490826
1,0,0.683438,0.003001,0.343395,0.000000,0.000000,0.882860,"['washington', 'reuters', 'rick', 'perry', 'pr...",6.385744
2,1,0.721030,0.073622,0.401345,0.000000,1.000000,0.883760,"['president', 'obama', 'blasted', 'republican'...",6.072961
3,1,0.667638,0.037264,0.462935,0.000000,1.000000,0.882882,"['male', 'idaho', 'republican', 'daughter', 'c...",6.469388
4,1,0.628032,-0.018966,0.479310,-0.800000,0.900000,0.883120,"['kellyanne', 'conway', 'tried', 'spin', 'whit...",6.296496
...,...,...,...,...,...,...,...,...,...
13829,0,0.659631,-0.030698,0.380595,0.000000,0.000000,0.882810,"['washington', 'reuters', 'member', 'congress'...",6.514512
13830,1,0.629126,-0.009035,0.312617,-0.066667,0.633333,0.895444,"['far', 'video', '530000', 'view', 'make', 'co...",6.782524
13831,0,0.595745,0.049287,0.244648,0.000000,0.000000,0.883645,"['dec', '27', 'story', 'corrects', 'say', '550...",6.117021
13832,0,0.696581,-0.026939,0.293520,0.000000,0.000000,0.895276,"['madrid', 'reuters', 'spain', 'high', 'court'...",6.478632


In [4]:
# Check each column's dtype to see which features are already numeric and which
# need vectorizing before modelling (non-numeric columns show up as `object`)
features_df.dtypes

label                    int64
TTR                    float64
text_polarity          float64
text_subjectivity      float64
title_polarity         float64
title_subjectivity     float64
doc_perplexity         float64
1_grams                 object
average_word_length    float64
dtype: object

# SVM feature based model

Because `1_grams` is text-based, it must be vectorized before the model can use it.
Therefore, we split the dataset into numerical and text-based features, and then we vectorize `1_grams` with TF-IDF.

In [8]:
# Numeric feature columns, i.e. every model input except the text column '1_grams'
num_features = [
    'TTR',
    'text_polarity',
    'text_subjectivity',
    'title_polarity',
    'title_subjectivity',
    'doc_perplexity',
    'average_word_length',
]

# Per-column preprocessing steps.
# - 'num': standardize the numeric columns.
# - 'txt': turn the '1_grams' text into TF-IDF weights. The column is given as a
#   plain string (not a list) so the vectorizer receives a 1-D sequence of
#   documents rather than a 2-D frame.
numeric_step = ('num', StandardScaler(), num_features)
text_step = ('txt', TfidfVectorizer(), '1_grams')
preprocessor = ColumnTransformer(transformers=[numeric_step, text_step])

# End-to-end model: column-wise preprocessing followed by a linear-kernel SVM
pipeline = make_pipeline(preprocessor, SVC(kernel='linear', C=1.0))


In [10]:
# Assemble the model inputs: the numeric columns followed by the '1_grams' text column
feature_columns = [*num_features, '1_grams']
X = features_df[feature_columns]
y = features_df['label']  # target vector

# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible
split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split

# Fit the whole pipeline (preprocessing + SVM) on the training portion only,
# so the scaler and TF-IDF vocabulary never see the test rows
pipeline.fit(X_train, y_train)



In [11]:
# Score the fitted pipeline on the held-out test set
predictions = pipeline.predict(X_test)

# Compute the metrics first, then print them
# NOTE(review): accuracy this close to 1.0 is unusually high — worth checking whether
# source-identifying tokens in '1_grams' (e.g. 'reuters', which appears in several
# label-0 rows of the displayed sample) are leaking the label. TODO confirm.
accuracy = accuracy_score(y_test, predictions)
report = classification_report(y_test, predictions)
print("Accuracy:", accuracy)
print("Classification Report:\n", report)


Accuracy: 0.9924105529454282
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99      1353
           1       0.99      0.99      0.99      1414

    accuracy                           0.99      2767
   macro avg       0.99      0.99      0.99      2767
weighted avg       0.99      0.99      0.99      2767

