In [1]:
'''
id: unique id for a news article
title: the title of a news article
author: author of the news article
text: the text of the article; could be incomplete
label: a label that marks the article as potentially unreliable: 1 = fake news and 0 = real news
'''

'\nid: unique id for a news article\ntitle: the title of a news article\nauthor: author of the news article\ntext: the text of the article; could be incomplete\nlabel: a label that marks the article as potentially unreliable: 1 = fake news and 0 = real news\n'

In [2]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [3]:
'''import nltk
nltk.download('stopwords')
'''

"import nltk\nnltk.download('stopwords')\n"

In [4]:
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [5]:
news_dataset = pd.read_csv('train.csv')

In [6]:
news_dataset.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [7]:
news_dataset.isnull().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [8]:
# Replace missing values with empty strings instead of dropping the affected rows,
# so every article stays available for training.
news_dataset = news_dataset.fillna('')

In [9]:
news_dataset.isnull().sum()

id        0
title     0
author    0
text      0
label     0
dtype: int64

In [10]:
# Build the column the model is trained on. Only the article text is used here;
# the title and author columns are NOT merged into it.
news_dataset['content'] = news_dataset['text']

In [11]:
news_dataset.head(2)

Unnamed: 0,id,title,author,text,label,content
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1,House Dem Aide: We Didn’t Even See Comey’s Let...
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0,Ever get the feeling your life circles the rou...


In [12]:
# Separate the target (label) from the remaining columns, which become the features.
Y = news_dataset['label']
X = news_dataset.drop(columns='label')

In [13]:
print(X,Y)

          id                                              title  \
0          0  House Dem Aide: We Didn’t Even See Comey’s Let...   
1          1  FLYNN: Hillary Clinton, Big Woman on Campus - ...   
2          2                  Why the Truth Might Get You Fired   
3          3  15 Civilians Killed In Single US Airstrike Hav...   
4          4  Iranian woman jailed for fictional unpublished...   
...      ...                                                ...   
20795  20795  Rapper T.I.: Trump a ’Poster Child For White S...   
20796  20796  N.F.L. Playoffs: Schedule, Matchups and Odds -...   
20797  20797  Macy’s Is Said to Receive Takeover Approach by...   
20798  20798  NATO, Russia To Hold Parallel Exercises In Bal...   
20799  20799                          What Keeps the F-35 Alive   

                                          author  \
0                                  Darrell Lucus   
1                                Daniel J. Flynn   
2                             Consortiu

###### Stemming:
    
Stemming is the process of reducing a word to its root word

example:
acting, acted, acts --> all get reduced to "act"

In [14]:
port_stem = PorterStemmer()

In [15]:
# Build the stop-word lookup once. Re-reading the NLTK word list for every token
# (and scanning it as a list) dominated the runtime when applied to every article;
# a frozenset gives constant-time membership tests with identical results.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def stemming(content):
    """Clean one document and return its stemmed tokens joined by single spaces.

    Steps, in order:
      1. every character that is not an ASCII letter is replaced by a space
         (digits and punctuation are dropped);
      2. the text is lower-cased and split on whitespace;
      3. English stop words are removed;
      4. each remaining token is reduced to its stem with ``port_stem``.

    Parameters
    ----------
    content : str
        Raw text of a single document.

    Returns
    -------
    str
        Space-separated stemmed tokens; an empty string if nothing survives.
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', content)
    tokens = letters_only.lower().split()
    stems = [port_stem.stem(token) for token in tokens if token not in _ENGLISH_STOPWORDS]
    return ' '.join(stems)

In [None]:
news_dataset['content'] = news_dataset['content'].apply(stemming)

In [None]:
print(news_dataset['content'])

In [None]:
# Separate the features from the target. 'content' is the only feature column used,
# so it is selected directly; dropping 'label' from the frame would only be the
# convenient route if several feature columns were being kept.
X = news_dataset['content'].values
Y = news_dataset['label'].values

In [None]:
print(X,Y)

In [None]:
# Convert the cleaned text into numeric features. TF-IDF weights a term by how often
# it occurs in a document, scaled down by how many documents contain it, so words
# that appear everywhere carry little weight.
# Y needs no conversion because it is already numeric (0/1).
# NOTE(review): the vocabulary and IDF weights are learned from every row, before the
# train/test split performed in a later cell, so test rows influence the features.
# Consider fitting the vectorizer on the training portion only.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X)  # learn the vocabulary and produce the feature matrix in one step

In [None]:
print(X.shape)

Splitting data to train and test

In [None]:
# Hold out 20% of the rows for testing, keeping the class ratio equal in both parts
# (stratify=Y). A fixed random_state makes the split, and therefore every score
# reported further down, reproducible across kernel restarts.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, stratify=Y, random_state=42
)

In [None]:
print(X.shape, X_train.shape, X_test.shape)

###### Model Selection

In [None]:
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn import svm
from sklearn.model_selection import GridSearchCV, ParameterGrid, RandomizedSearchCV

# Model selection in two passes per model family:
#   1. coarse: randomized search over a wide grid;
#   2. fine:   exhaustive grid search around the best coarse setting.
# The family with the highest cross-validated accuracy is then scored on the held-out test set.

SEARCH_SEED = 42       # fixed seed so the sampled candidates (and the results) are reproducible
COARSE_MAX_ITER = 30   # upper bound on candidates sampled per coarse search
CV_FOLDS = 5


def _coarse_iterations(param_grid):
    """Return how many candidates to sample: COARSE_MAX_ITER, capped at the grid size.

    Asking RandomizedSearchCV for more iterations than the grid contains only
    produces a warning and falls back to the full grid, so the cap is explicit.
    """
    return min(COARSE_MAX_ITER, len(ParameterGrid(param_grid)))


# Define parameter grids for randomized search (coarse search)
logistic_param_grid_coarse = {
    'penalty': ['l1', 'l2'],
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear', 'saga']
}

xgboost_param_grid_coarse = {
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.5, 0.8, 1.0],
    'colsample_bytree': [0.5, 0.8, 1.0]
}

# gamma has no effect on the linear kernel, so it is only searched for rbf;
# otherwise each linear candidate would be fitted once per (irrelevant) gamma value.
svm_param_grid_coarse = [
    {'kernel': ['linear'], 'C': [0.1, 1, 10]},
    {'kernel': ['rbf'], 'C': [0.1, 1, 10], 'gamma': [0.01, 0.1, 1]},
]

# RandomizedSearchCV for each model (coarse search)
logistic_random_search_coarse = RandomizedSearchCV(
    LogisticRegression(random_state=SEARCH_SEED),
    logistic_param_grid_coarse,
    n_iter=_coarse_iterations(logistic_param_grid_coarse),
    cv=CV_FOLDS,
    n_jobs=-1,
    random_state=SEARCH_SEED,
)
xgboost_random_search_coarse = RandomizedSearchCV(
    XGBClassifier(objective='binary:logistic', random_state=SEARCH_SEED),
    xgboost_param_grid_coarse,
    n_iter=_coarse_iterations(xgboost_param_grid_coarse),
    cv=CV_FOLDS,
    n_jobs=-1,
    random_state=SEARCH_SEED,
)
svm_random_search_coarse = RandomizedSearchCV(
    svm.SVC(),
    svm_param_grid_coarse,
    n_iter=_coarse_iterations(svm_param_grid_coarse),
    cv=CV_FOLDS,
    n_jobs=-1,
    random_state=SEARCH_SEED,
)

# Fit models using RandomizedSearchCV (coarse search)
logistic_random_search_coarse.fit(X_train, Y_train)
xgboost_random_search_coarse.fit(X_train, Y_train)
svm_random_search_coarse.fit(X_train, Y_train)

# Get best hyperparameters from RandomizedSearchCV (coarse search)
best_logistic_params_coarse = logistic_random_search_coarse.best_params_
best_xgboost_params_coarse = xgboost_random_search_coarse.best_params_
best_svm_params_coarse = svm_random_search_coarse.best_params_

# Define parameter grids for GridSearchCV (fine search): vary the main
# regularisation / step-size parameter by a factor around the coarse optimum
# and pin every other parameter to its coarse best value.
logistic_param_grid_fine = {
    'penalty': [best_logistic_params_coarse['penalty']],
    'C': [best_logistic_params_coarse['C'] * i for i in [0.1, 1, 10]],
    'solver': [best_logistic_params_coarse['solver']]
}

xgboost_param_grid_fine = {
    'learning_rate': [best_xgboost_params_coarse['learning_rate'] * i for i in [0.5, 1, 2]],
    'n_estimators': [best_xgboost_params_coarse['n_estimators']],
    'max_depth': [best_xgboost_params_coarse['max_depth']],
    'min_child_weight': [best_xgboost_params_coarse['min_child_weight']],
    'subsample': [best_xgboost_params_coarse['subsample']],
    'colsample_bytree': [best_xgboost_params_coarse['colsample_bytree']]
}

svm_param_grid_fine = {
    'C': [best_svm_params_coarse['C'] * i for i in [0.1, 1, 10]],
    'kernel': [best_svm_params_coarse['kernel']]
}
# gamma is only present in the coarse result when the rbf kernel won.
if 'gamma' in best_svm_params_coarse:
    svm_param_grid_fine['gamma'] = [best_svm_params_coarse['gamma'] * i for i in [0.1, 1, 10]]

# GridSearchCV for each model (fine search)
logistic_grid_search_fine = GridSearchCV(
    LogisticRegression(random_state=SEARCH_SEED),
    param_grid=logistic_param_grid_fine, cv=CV_FOLDS, n_jobs=-1,
)
xgboost_grid_search_fine = GridSearchCV(
    XGBClassifier(objective='binary:logistic', random_state=SEARCH_SEED),
    param_grid=xgboost_param_grid_fine, cv=CV_FOLDS, n_jobs=-1,
)
svm_grid_search_fine = GridSearchCV(
    svm.SVC(),
    param_grid=svm_param_grid_fine, cv=CV_FOLDS, n_jobs=-1,
)

# Fit models using GridSearchCV (fine search)
logistic_grid_search_fine.fit(X_train, Y_train)
xgboost_grid_search_fine.fit(X_train, Y_train)
svm_grid_search_fine.fit(X_train, Y_train)

# Print best hyperparameters from GridSearchCV (fine search)
print("Logistic Regression Best Parameters (Fine Search):", logistic_grid_search_fine.best_params_)
print("XGBoost Best Parameters (Fine Search):", xgboost_grid_search_fine.best_params_)
print("SVM Best Parameters (Fine Search):", svm_grid_search_fine.best_params_)

# Compare cross-validated scores of each model
logistic_cv_score_fine = logistic_grid_search_fine.best_score_
xgboost_cv_score_fine = xgboost_grid_search_fine.best_score_
svm_cv_score_fine = svm_grid_search_fine.best_score_

# Select the best model based on cross-validated scores. max() returns the first
# maximal element, so ties resolve in list order: logistic, then XGBoost, then SVM.
fine_searches = [logistic_grid_search_fine, xgboost_grid_search_fine, svm_grid_search_fine]
best_model_fine = max(fine_searches, key=lambda search: search.best_score_).best_estimator_

# Evaluate the best model on the train and test sets (GridSearchCV has already
# refit best_estimator_ on the whole training set).
train_accuracy_fine = best_model_fine.score(X_train, Y_train)
print("Best Model Train Accuracy (Fine Search):", train_accuracy_fine)
test_accuracy_fine = best_model_fine.score(X_test, Y_test)
print("Best Model Test Accuracy (Fine Search):", test_accuracy_fine)




In [25]:
import joblib

# Save the best model to 'text_only.pkl' in the current working directory.
# NOTE(review): the recorded output of this cell shows 'best_model_fine.pkl', so it
# comes from an earlier version of the code; re-run the cell to refresh it.
joblib.dump(best_model_fine, 'text_only.pkl')


['best_model_fine.pkl']

Training model - Logistic Regression

In [None]:
model = LogisticRegression()

In [None]:
model.fit(X_train, Y_train)

Model Evaluation

In [None]:
# Accuracy on the training data.
# accuracy_score expects (y_true, y_pred); the arguments were previously swapped.
# Accuracy is symmetric so the value is unchanged, but the documented order keeps
# this cell correct if another, non-symmetric metric is substituted later.
X_train_prediction = model.predict(X_train)
training_data_prediction = accuracy_score(Y_train, X_train_prediction)

In [None]:
print("Accuracy score of the training data:", training_data_prediction)

In [None]:
# Accuracy on the held-out test data.
# accuracy_score expects (y_true, y_pred); the arguments were previously swapped.
# Accuracy is symmetric so the value is unchanged, but the documented order keeps
# this cell correct if another, non-symmetric metric is substituted later.
X_test_prediction = model.predict(X_test)
test_data_prediction = accuracy_score(Y_test, X_test_prediction)

In [None]:
print('Accuracy score of the test data: ', test_data_prediction)

Make a Predictive System

In [28]:
# Classify a single held-out article (row 70 of the test matrix) with the tuned model.
X_new = X_test[70]
print(X_new.shape)

prediction = best_model_fine.predict(X_new)
print(prediction)

# Label convention from the dataset description: 0 = real news, 1 = fake news.
verdict = 'This is real news' if prediction[0] == 0 else 'This is fake news'
print(verdict)

(1, 132672)
[0]
This is real news


In [27]:
print (Y_test[70])

0
