In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.tokenize import word_tokenize
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix

In [2]:
df=pd.read_csv('text_dataset.csv')
df

Unnamed: 0,label,cleaned_text,cleaned_subject,cleaned_title,token_count
0,1,donald trump met member nato go well moment ar...,news,watch trump shove foreign leader way get front...,71
1,0,washington reuters rick perry presidentelect d...,politicsnews,trump energy pick perry softens stance climate...,128
2,1,president obama blasted republican presidentia...,politics,obama finally build border wallbut there one p...,81
3,1,male idaho republican five daughter made creep...,news,republican lawmaker say rape wont cause pregna...,82
4,1,kellyanne conway tried spin white house press ...,news,watch chuck todd swat annoying kellyanne conwa...,75
...,...,...,...,...,...
13829,0,washington reuters member u congress party fri...,politicsnews,u lawmaker back syria strike demand plan trump,107
13830,1,far video 530000 view make content legitimate ...,politics,ups secretly fly refugee u middle east watch g...,110
13831,0,dec 27 story corrects say 55000 page email ins...,politicsnews,u appeal court revives clinton email suit,82
13832,0,madrid reuters spain high court said tuesday g...,worldnews,spanish court grant u extradition russian hack...,75


Vectorize text columns and combine in feature matrix

In [8]:
from scipy.sparse import hstack

# vectorize the text columns
tfidf_vectorizer_text = TfidfVectorizer()
tfidf_vectorizer_subject = TfidfVectorizer()
tfidf_vectorizer_title = TfidfVectorizer()

# transform each column
X_tfidf_text = tfidf_vectorizer_text.fit_transform(df['cleaned_text'])
X_tfidf_subject = tfidf_vectorizer_subject.fit_transform(df['cleaned_subject'])
X_tfidf_title = tfidf_vectorizer_title.fit_transform(df['cleaned_title'])

# Combine TF-IDF from each column into feature matrix
X_tfidf_combined = hstack([X_tfidf_text, X_tfidf_subject, X_tfidf_title])

Split into train and test set

In [9]:
# Define label
y = df['label']

# split into test and train where test i 20% and train is 80%
X_train, X_test, y_train, y_test = train_test_split(X_tfidf_combined, y, test_size=0.2, random_state=42)

# Logistic Regression

Create and train Logistic Regression model

In [13]:
# create and train model
logistic_regression_model = LogisticRegression()
logistic_regression_model.fit(X_train, y_train)

# make predictions
y_pred = logistic_regression_model.predict(X_test)

View performance of model

In [14]:
# performance metrics
print(classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Accuracy Score:", accuracy_score(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1353
           1       1.00      1.00      1.00      1414

    accuracy                           1.00      2767
   macro avg       1.00      1.00      1.00      2767
weighted avg       1.00      1.00      1.00      2767

Confusion Matrix:
 [[1353    0]
 [   0 1414]]
Accuracy Score: 1.0


In [15]:
# get coefficients of the model for interpretation
feature_names = tfidf_vectorizer_text.get_feature_names_out().tolist() \
                + tfidf_vectorizer_subject.get_feature_names_out().tolist() \
                + tfidf_vectorizer_title.get_feature_names_out().tolist()

coefficients = logistic_regression_model.coef_.flatten()

feature_importance = dict(zip(feature_names, coefficients))

# Sort features by their values
sorted_feature_importance = sorted(feature_importance.items(), key=lambda item: item[1], reverse=True)

# top 10 positive and top 10 negative coefficients
print("Top 10 positive features: ", sorted_feature_importance[:10])
print("Top 10 negative features: ", sorted_feature_importance[-10:])

Top 10 positive features:  [('leftnews', 3.1685862579653468), ('middleeast', 2.276133256583355), ('usnews', 2.201176797923478), ('video', 0.3731530093545749), ('hillary', 0.24366927717940456), ('21st', 0.18112835199749105), ('obama', 0.1574365553299408), ('news', 0.13447411588866431), ('america', 0.1343298434437592), ('breaking', 0.12883778597317186)]
Top 10 negative features:  [('un', -0.10081584397879031), ('korea', -0.10814361379029246), ('eu', -0.11754634424191325), ('talk', -0.12023033263341393), ('factbox', -0.12198750600743422), ('north', -0.12292501364018923), ('china', -0.1354714561320958), ('say', -0.21406711602602185), ('worldnews', -7.697705389075868), ('politicsnews', -8.218068352560541)]


## Tuning Logistic Regression

In [16]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

# Create a logistic regression model with 'liblinear' solver
logistic_regression = LogisticRegression(solver='liblinear')

# Create a dictionary of all values you want to test for C and penalty
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],  # Regularization parameter
    'penalty': ['l1', 'l2'],  # The norm used in the penalization
}

# Use GridSearchCV to search for the best parameters
grid_search = GridSearchCV(logistic_regression, param_grid, cv=5, verbose=2, n_jobs=-1)

# Fit the grid search model with your training data
grid_search.fit(X_train, y_train)

# Output the best parameters and the best score
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (


[CV] END ................................C=0.001, penalty=l1; total time=   0.5s
[CV] END ................................C=0.001, penalty=l1; total time=   0.5s
[CV] END ................................C=0.001, penalty=l1; total time=   0.6s
[CV] END ................................C=0.001, penalty=l2; total time=   0.6s
[CV] END ................................C=0.001, penalty=l1; total time=   0.6s
[CV] END ................................C=0.001, penalty=l1; total time=   0.6s
[CV] END ................................C=0.001, penalty=l2; total time=   0.7s
[CV] END ................................C=0.001, penalty=l2; total time=   0.7s
[CV] END ................................C=0.001, penalty=l2; total time=   0.3s
[CV] END ................................C=0.001, penalty=l2; total time=   0.4s
[CV] END .................................C=0.01, penalty=l2; total time=   0.5s
[CV] END .................................C=0.01, penalty=l2; total time=   0.4s
[CV] END ...................

In [17]:
# Use the best estimator to make predictions on the test set
y_pred = grid_search.best_estimator_.predict(X_test)

# Evaluate the model performance
accuracy = accuracy_score(y_test, y_pred)
conf_mat = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Output the evaluation results
print('Accuracy:', accuracy)
print('Confusion Matrix:\n', conf_mat)
print('Classification Report:\n', report)

Accuracy: 1.0
Confusion Matrix:
 [[1353    0]
 [   0 1414]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      1353
           1       1.00      1.00      1.00      1414

    accuracy                           1.00      2767
   macro avg       1.00      1.00      1.00      2767
weighted avg       1.00      1.00      1.00      2767

