In [1]:
import pandas as pd
import numpy as np


In [2]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [3]:
%ls

[0m[01;34mdrive[0m/  [01;34msample_data[0m/


In [4]:
# Define paths to point to the correct location in Google Drive
csv_path = '/content/drive/My Drive/Group8-Project4/Resources/FakeNews_Processed_Data.csv'
npy_path = '/content/drive/My Drive/Group8-Project4/vectorized_data.npy'


In [5]:
# Load the data frame
df = pd.read_csv(csv_path)

df.head(10)

Unnamed: 0,title_text,label
0,law enforcement high alert following threats c...,1
1,post votes hillary already,1
2,unbelievable obamas attorney general says char...,1
3,bobby jindal raised hindu uses story christian...,0
4,satan russia unvelis image terrifying new supe...,1
5,time christian group sues amazon splc designat...,1
6,dr ben carson targeted irs never audit spoke n...,1
7,house intel chair trumprussia fake story evide...,1
8,sports bar owner bans nfl gameswill show true ...,1
9,latest pipeline leak underscores dangers dakot...,1


In [6]:
# Extract labels
y = df['label'].values


In [7]:
# Load vectorized features
X = np.load(npy_path)

In [None]:
#check y
print(y)

[1 1 1 ... 0 0 1]


In [None]:
#checking that X has loaded properly
print(X[0])

[ 2.81908438e-02  2.75723021e-02  3.58001590e-02  6.53967410e-02
 -5.46097308e-02 -2.37619095e-02 -4.09275247e-03 -8.24435055e-02
  6.89467937e-02  8.02911073e-02 -2.15530396e-02 -1.22802809e-01
 -3.96617614e-02  5.43146245e-02 -9.78071019e-02  7.72967935e-02
 -1.07843494e-02  8.95663351e-02 -3.19761001e-02 -7.24404454e-02
  1.83907095e-02  3.65553647e-02  3.63810174e-02 -3.08289602e-02
  5.26063889e-03 -1.12113245e-02 -8.50686207e-02  4.42833193e-02
  4.32750955e-02 -6.67038513e-03  8.79115239e-03  1.32957762e-02
 -3.93132754e-02 -2.35816129e-02 -1.39553228e-03  1.34147163e-02
  5.09627163e-02  1.21055013e-02  2.41050366e-02  6.83943406e-02
  7.42554888e-02 -6.91036358e-02  1.33677557e-01 -1.94583535e-02
 -2.85491417e-03 -5.00720553e-02 -6.66472167e-02 -4.27507348e-02
 -2.42126845e-02  4.17479239e-02 -2.60889009e-02  3.14749368e-02
  2.81617939e-02 -8.21636000e-04  1.62940919e-02 -7.99699279e-04
 -6.70565292e-02 -5.35239205e-02  9.20308195e-03 -7.64516741e-02
 -1.03444858e-02  7.03349

# **Split our Data into Training and Testing**

In [8]:
from sklearn.model_selection import train_test_split

# Split the data - 80% for training, 20% for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


# **Create a logistic Regression Model**

In [None]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(max_iter=1000, random_state=1)
classifier

# **Fit training data to our model**

In [None]:
# Train the data
classifier.fit(X_train, y_train)

# **Validate the model using test data**

In [None]:
# Score the model
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.8749545115843832
Testing Data Score: 0.8685797463090039


# **Make Predictions**

In [None]:
 # Predict outcomes for test data set
predictions = classifier.predict(X_test)
pd.DataFrame({"Prediction": predictions, "Actual": y_test})

Unnamed: 0,Prediction,Actual
0,1,1
1,1,0
2,0,0
3,1,1
4,1,1
...,...,...
14422,1,1
14423,1,1
14424,1,1
14425,1,1


In [None]:
from sklearn.metrics import accuracy_score
# Display the accuracy score for the test dataset.
accuracy_score(y_test, predictions)

0.8685797463090039

# **Optimsing the Model**

In [9]:
#define parameter grid
param_grid = {
    'C': np.logspace(-4, 4, 20),  # A range of regularization strengths
    'penalty': ['l1', 'l2'],  # L1 and L2 regularization
    'solver': ['liblinear', 'saga']  # Solvers that can handle both L1 and L2 penalty
}


In [10]:
#Initialize Logistic Regression and GridSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression()
grid_search = GridSearchCV(log_reg, param_grid, cv=5, verbose=0, n_jobs=-1)

In [11]:
#Fit the Grid Search to the Data
grid_search.fit(X_train, y_train)


In [12]:
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)


Best parameters: {'C': 10000.0, 'penalty': 'l2', 'solver': 'liblinear'}
Best score: 0.8783163522196432


In [14]:
#Use the best model found by the grid search to make predictions or further analyze
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)


In [15]:
#Evaluate the model on test data
from sklearn.metrics import accuracy_score, classification_report

print("Test set accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Test set accuracy: 0.8736397033340265
              precision    recall  f1-score   support

           0       0.87      0.87      0.87      7006
           1       0.88      0.88      0.88      7421

    accuracy                           0.87     14427
   macro avg       0.87      0.87      0.87     14427
weighted avg       0.87      0.87      0.87     14427



# **Save the Model Using joblib**

In [16]:
!pip install joblib




In [17]:
from joblib import dump

# Define the path in your Google Drive to save the model
model_path = '/content/drive/My Drive/Group8-Project4/my_logistic_regression_model.joblib'

# Save your model
dump(best_model, model_path)


['/content/drive/My Drive/Group8-Project4/my_logistic_regression_model.joblib']

In [None]:
#Loading the Model later
# from joblib import load

# # Same path where you saved it
# model_path = '/content/drive/My Drive/Group8-Project4/my_logistic_regression_model.joblib'

# # Load the model
# loaded_model = load(model_path)
