# Libraries

In [9]:
import json
# Read the notebook settings from config.json in the working directory.
with open('config.json') as config_file:
    config = json.load(config_file)

TOKENIZER_ID = config['TOKENIZER_ID']
# A configured nrows of 0 is mapped to None; any other value is kept as-is.
nrows = config['nrows'] if config['nrows'] != 0 else None
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    make_scorer,
    precision_score,
    recall_score,
    roc_auc_score,
    roc_curve,
)
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split # to separate the dataset.
from transformers import BertTokenizer, BertModel

# Data access

In [2]:
# Load the dataset (limited to `nrows` rows when nrows is not None) and drop
# the 'Unnamed: 0' column (presumably a saved index - TODO confirm).
data = pd.read_csv('title_text.csv', nrows=nrows).drop(columns=['Unnamed: 0'])
print(data.shape)
# Keep the preview as the cell's last expression: a bare `data.head()` in the
# middle of a cell is evaluated but never displayed.
data.head()

(42834, 3)


## Transformation du texte en données numériques (BERT)
Nous partons du principe que les données ont été nettoyées et que nous pouvons nous concentrer sur la création d'un modèle.

Comme les informations principales pour distinguer les fake news des vraies news se trouvent dans le texte et le titre, nous devons nous focaliser sur ces informations. Il faut donc transformer le texte et le titre en données numériques afin que notre modèle puisse les exploiter pour trouver la bonne réponse. Nous utilisons pour cela le tokenizer BERT (dans ce notebook, seule la colonne `text` est tokenisée).

In [3]:

X = data['text']

# Load the pretrained, lower-casing BERT tokenizer and the matching BERT model.
# NOTE(review): model_bert is not referenced by any other cell shown in this notebook.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
model_bert = BertModel.from_pretrained('bert-base-uncased')

# Every text is truncated or padded to exactly this many token ids.
max_seq_length = 512


def encode_text(text):
    """Return the fixed-length list of BERT token ids for one text."""
    return tokenizer.encode(
        text,
        add_special_tokens=True,
        max_length=max_seq_length,
        truncation=True,
        padding='max_length',
    )


# Replace each raw text with its list of token ids.
X = X.apply(encode_text)
print(X)

0        [101, 2005, 3071, 2060, 2084, 2672, 2010, 5211...
1        [101, 8755, 1006, 26665, 1007, 1011, 1037, 764...
2        [101, 4199, 1006, 26665, 1007, 1011, 2280, 353...
3        [101, 1037, 9410, 2277, 2326, 2012, 1996, 2358...
4        [101, 1996, 8398, 2317, 2160, 2003, 2085, 3985...
                               ...                        
42829    [101, 1006, 26665, 1007, 1011, 6041, 7206, 270...
42830    [101, 2085, 2008, 6221, 8398, 2038, 2915, 2370...
42831    [101, 1996, 2214, 10539, 2962, 2087, 4141, 210...
42832    [101, 2899, 1006, 26665, 1007, 1011, 4883, 401...
42833    [101, 2899, 1006, 26665, 1007, 1011, 1057, 101...
Name: text, Length: 42834, dtype: object


In [4]:
# Target vector: the isFake column cast to integers, followed by the class balance.
y = data['isFake'].astype(int)
class_counts = y.value_counts()
print(class_counts)

1    21417
0    21417
Name: isFake, dtype: int64


# Data split

In [5]:
# Split the data into training (80%), test (10%) and validation (10%) sets:
# first hold out 20% of the rows, then cut that hold-out part in half.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_test, X_validation, y_test, y_validation = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=42
)

In [6]:
def _as_array(values):
    """Convert an object exposing .tolist() (here: a pandas Series) to a NumPy array."""
    return np.array(values.tolist())


# Turn every split into plain NumPy arrays and report the resulting shapes.
X_train, y_train = _as_array(X_train), _as_array(y_train)
print(X_train.shape, y_train.shape)

X_test, y_test = _as_array(X_test), _as_array(y_test)
print(X_test.shape, y_test.shape)

X_validation, y_validation = _as_array(X_validation), _as_array(y_validation)
print(X_validation.shape, y_validation.shape)


(34267, 512) (34267,)
(4283, 512) (4283,)
(4284, 512) (4284,)


# Model Logistic regression

In [7]:
# Baseline: logistic regression with default hyper-parameters, trained directly
# on the token-id matrix, then scored (mean accuracy) on the test split.
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

Accuracy = logreg.score(X_test, y_test)
print("Accuracy Test Data: " + str(Accuracy * 100) + "%")

Accuracy Test Data: 82.93252393182348%


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [11]:
# Accuracy of the baseline model on the validation split.
y_pred_validation = logreg.predict(X_validation)
accuracy_validation = accuracy_score(y_validation, y_pred_validation)
print("Accuracy Validation Data: " + str(accuracy_validation * 100) + "%")

Accuracy Validation Data: 83.73015873015873%


# Model Optimization
We create a grid search algorithm to find the best hyperparameters for the model.
The model is a logistic regression model with the following hyperparameters:
- C: the inverse of the regularization strength. Smaller values of C mean stronger regularization (a simpler model, less prone to overfitting), while larger values mean weaker regularization, letting the model fit the training data more closely.
- max_iter: the maximum number of iterations the solver may run before stopping. If it is too low, the optimizer stops before converging (scikit-learn then warns that the iteration limit was reached, as seen with the baseline model above); higher values give the solver more room to converge at the cost of longer training.

These parameters are tested on the following (arbitrarily chosen) values:
- C: [0.01, 0.05, 0.1, 0.5, 1, 5]
- max_iter: [10, 50, 100, 500, 1000, 1500]

In [13]:
# Hyper-parameter grid: every (C, max_iter) pair is tried, 6 x 6 = 36 candidates.
params_grid = dict(
    C=[0.01, 0.05, 0.1, 0.5, 1, 5],
    max_iter=[10, 50, 100, 500, 1000, 1500],
)

# Base estimator handed to the grid search.
model = LogisticRegression()
# Wrap accuracy_score so it can be used as the grid search's scoring function.
scorer = make_scorer(accuracy_score)

search_options = {
    'estimator': model,
    'param_grid': params_grid,    # values to test
    'scoring': scorer,
    'cv': 5,                      # 5-fold cross-validation
    'verbose': 2,                 # print progress logs
    'n_jobs': -1,                 # use every CPU core
    'return_train_score': True,   # also record the training scores
}
grid_search = GridSearchCV(**search_options)

# Launch all the cross-validated fits on the training split.
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   50.6s


In [None]:
# Retrieve the winning hyper-parameter combination and its cross-validated score.
best_params, best_score = grid_search.best_params_, grid_search.best_score_

print(f"Best parameters: {best_params}")
print(f"Best score: {best_score}")

# Model Evaluation

In [None]:
# Keep a handle on the best estimator found by the grid search.
best_model = grid_search.best_estimator_

In [None]:
# Predict the labels of the test split with the tuned model.
y_pred_test = best_model.predict(X_test)

In [None]:
# Accuracy of the tuned model on the test split.
accuracy_test = accuracy_score(y_test, y_pred_test)
print(f"Accuracy Test Data: {accuracy_test*100}%")

In [None]:
# Predict the validation split with the tuned model, then measure its accuracy.
y_pred_validation = best_model.predict(X_validation)
accuracy_validation = accuracy_score(y_validation, y_pred_validation)
print(f"Accuracy Validation Data: {accuracy_validation*100}%")

In [None]:
# Additional metrics for the tuned logistic regression model, computed on the
# validation predictions.

# Per-class precision / recall / F1 summary.
validation_report = classification_report(y_validation, y_pred_validation)
print(validation_report)

# Individual scores, stored for later use (they are not printed here).
precision = precision_score(y_validation, y_pred_validation)
recall = recall_score(y_validation, y_pred_validation)
f1 = f1_score(y_validation, y_pred_validation)

In [None]:
# Confusion matrix of the tuned model on the test split, drawn as an annotated heatmap.
plt.figure(figsize=(5, 5))
test_confusion = confusion_matrix(y_test, y_pred_test)
sns.heatmap(test_confusion, annot=True, fmt='d')
plt.xlabel('Predicted labels')
plt.ylabel('True labels')
plt.title('Confusion Matrix')
plt.show()

In [None]:
# ROC curve of the tuned model on the test split.
# Requires roc_auc_score (and roc_curve) from sklearn.metrics in the import cell;
# without that import this cell stops with a NameError.

# Scores used for the curve: column 1 of predict_proba.
y_pred = best_model.predict_proba(X_test)[:, 1]
# Area under the ROC curve, shown in the legend.
roc_auc = roc_auc_score(y_test, y_pred)

# False/true positive rates for each threshold (thresholds themselves are discarded).
fpr, tpr, _ = roc_curve(y_test, y_pred)

plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlabel('Taux de Faux Positives (%)')
plt.ylabel('Taux de Vrai Positives (%)')
plt.title('Receiver Operating Characteristic (ROC)')
plt.legend(loc="lower right")
plt.show()

# Test de performances sur des données téléchargées ailleurs

In [None]:
# Evaluate the tuned model on a second dataset read from data/validation_data.csv.
validation_data = pd.read_csv("data/validation_data.csv", nrows=nrows)
validation_data = validation_data[['text', 'isFake']]

# Features (lower-cased text) and labels.
validation_data_X = validation_data['text'].apply(lambda x: x.lower())  # the text data
validation_data_y = validation_data['isFake']

# Tokenize with the same BERT tokenizer and the same fixed length as the
# training data. padding='max_length' is required here: padding=True only pads
# to the longest text of the batch, which can yield fewer than max_seq_length
# columns and make predict() fail on a feature-count mismatch.
encoded_inputs = tokenizer(
    validation_data_X.tolist(),
    padding='max_length',
    truncation=True,
    max_length=max_seq_length,
    return_tensors='np',
)

# Get the input IDs and attention masks
input_ids = encoded_inputs['input_ids']
attention_masks = encoded_inputs['attention_mask']

# The model was fitted on rows of exactly max_seq_length token ids.
assert input_ids.shape[1] == max_seq_length, "unexpected tokenized sequence length"

# Use the trained model to make predictions on the dataset
predictions = best_model.predict(input_ids)


In [1]:
# Accuracy of the tuned model on the second dataset.
accuracy = accuracy_score(validation_data_y, predictions)
print(f"Accuracy different Dataset: {accuracy*100}%")

NameError: name 'accuracy_score' is not defined

In [None]:
# Confusion matrix of the tuned model on the second dataset.
cm = confusion_matrix(validation_data_y, predictions)

# Draw it as an annotated heatmap. fmt='d' prints the cells as plain integers,
# matching the test-split confusion matrix plotted earlier in this notebook
# (the default annotation format switches to scientific notation for large counts).
sns.heatmap(cm, annot=True, fmt='d')
plt.xlabel('Les labels prédits')
plt.ylabel('Les vrais labels')
plt.title('Matrice de confusion')
plt.show()

In [None]:
# Print the raw counts (not percentages) of the four confusion-matrix cells.
# cm.ravel() flattens the matrix row by row, unpacked here as TN, FP, FN, TP.
flat_counts = cm.ravel()
tn, fp, fn, tp = flat_counts

print(f"True Negative: {tn}")
print(f"False Positive: {fp}")
print(f"False Negative: {fn}")
print(f"True Positive: {tp}")

# Conclusion