In [25]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import nltk
import re
from nltk.corpus import stopwords as stpw
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from scipy.sparse import hstack
from scipy.sparse import csr_matrix
import joblib

In [26]:
# Import the required prelabelled sonadataset
sona_df = pd.read_csv('sonadataset.csv',encoding='ISO-8859-1')
sona_df.head()

Unnamed: 0,id,date,speaker,claims,label
0,0,2001,H.E. John Agyekum Kufuor,The economy inherited was in a poor state with...,1
1,1,2001,H.E. John Agyekum Kufuor,The total debt stock stood at 41.1 trillion ce...,0
2,2,2001,H.E. John Agyekum Kufuor,The government is committed to democratic gove...,1
3,3,2001,H.E. John Agyekum Kufuor,The government will focus on national reconcil...,1
4,4,2001,H.E. John Agyekum Kufuor,Private sector development will be the main en...,1


In [27]:
sona_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2609 entries, 0 to 2608
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   id       2609 non-null   int64 
 1   date     2609 non-null   int64 
 2   speaker  2609 non-null   object
 3   claims   2609 non-null   object
 4   label    2609 non-null   int64 
dtypes: int64(3), object(2)
memory usage: 102.0+ KB


In [28]:
sona_df['label'].value_counts()

label
1    2489
0     120
Name: count, dtype: int64

In [29]:
sona_df.shape

(2609, 5)

In [30]:
# Checking the data for missing valuessona_df.isna().sum()

In [31]:
sona_df.head()

Unnamed: 0,id,date,speaker,claims,label
0,0,2001,H.E. John Agyekum Kufuor,The economy inherited was in a poor state with...,1
1,1,2001,H.E. John Agyekum Kufuor,The total debt stock stood at 41.1 trillion ce...,0
2,2,2001,H.E. John Agyekum Kufuor,The government is committed to democratic gove...,1
3,3,2001,H.E. John Agyekum Kufuor,The government will focus on national reconcil...,1
4,4,2001,H.E. John Agyekum Kufuor,Private sector development will be the main en...,1


In [32]:
sona_df = sona_df.drop(['id'], axis = 1)

In [33]:
sona_df.head()

Unnamed: 0,date,speaker,claims,label
0,2001,H.E. John Agyekum Kufuor,The economy inherited was in a poor state with...,1
1,2001,H.E. John Agyekum Kufuor,The total debt stock stood at 41.1 trillion ce...,0
2,2001,H.E. John Agyekum Kufuor,The government is committed to democratic gove...,1
3,2001,H.E. John Agyekum Kufuor,The government will focus on national reconcil...,1
4,2001,H.E. John Agyekum Kufuor,Private sector development will be the main en...,1


In [34]:
# Encode the speaker labels
label_encoder = LabelEncoder()
sona_df['encoded_speaker'] = label_encoder.fit_transform(sona_df['speaker'])
sona_df

Unnamed: 0,date,speaker,claims,label,encoded_speaker
0,2001,H.E. John Agyekum Kufuor,The economy inherited was in a poor state with...,1,2
1,2001,H.E. John Agyekum Kufuor,The total debt stock stood at 41.1 trillion ce...,0,2
2,2001,H.E. John Agyekum Kufuor,The government is committed to democratic gove...,1,2
3,2001,H.E. John Agyekum Kufuor,The government will focus on national reconcil...,1,2
4,2001,H.E. John Agyekum Kufuor,Private sector development will be the main en...,1,2
...,...,...,...,...,...
2604,2024,H. E. Nana Addo Dankwa Akufo-Addo,some six hundred and ninety thousand hectares ...,1,1
2605,2024,H. E. Nana Addo Dankwa Akufo-Addo,Government has directly intervened to stimulat...,1,1
2606,2024,H. E. Nana Addo Dankwa Akufo-Addo,this has led to the development of three hundr...,1,1
2607,2024,H. E. Nana Addo Dankwa Akufo-Addo,these business promoters have so far invested ...,1,1


In [35]:
# Data Preprocessing functions
lm = WordNetLemmatizer()
ps = PorterStemmer()
stopwords = stpw.words('english')
preprocessed_list = []


In [36]:
for claim in range(len(sona_df)):
    review = re.sub(r'[^a-zA-Z0-9]',' ',sona_df['claims'][claim])
    # LowerCase
    review = review.lower()
    # Tokenization
    review = review.split()
    # Lemmatization
    review = [lm.lemmatize (claim) for claim in review if claim not in stopwords]
    review = " ".join(review)
    preprocessed_list.append(review)

In [37]:
len(preprocessed_list)

2609

In [38]:
sona_df['claims'][0]

'The economy inherited was in a poor state with real GDP growth in 2000 at 3.7% lower than the projected 5%'

In [39]:
preprocessed_list[0]

'economy inherited poor state real gdp growth 2000 3 7 lower projected 5'

In [49]:
# Vectorization(converting text into vecotor data)
tf = TfidfVectorizer()
X_claims = tf.fit_transform(preprocessed_list).toarray()

# Create a sparse matrix for the encoded speakers
X_speaker = sona_df['encoded_speaker'].values.reshape(-1, 1)  # Reshape for sparse matrix compatibility

# Convert the encoded speaker array to a sparse matrix

X_speaker_sparse = csr_matrix(X_speaker)

# Combine TF-IDF matrix and encoded speaker matrix
X_combined = hstack([X_claims, X_speaker_sparse])

In [51]:
y = sona_df['label']
y.head()
joblib.dump(tf, 'tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']

In [42]:
# Splitting the data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size = 0.2 ,random_state = 42)

In [43]:
X_train.shape, len(y_train)

((2087, 5812), 2087)

In [44]:
X_test.shape, len(y_test)

((522, 5812), 522)

In [45]:
# Model Building
# Train RandomForestClassifier
rf = RandomForestClassifier(random_state=42,class_weight='balanced')
rf.fit(X_train,y_train)

In [46]:
# Train a Logistic Regression Classifier
lo_mo = LogisticRegression(random_state=42, class_weight='balanced')  # Increase max_iter if necessary for convergence
lo_mo.fit(X_train, y_train)

In [47]:
# Train a Decision Tree Classifier
deci_tree_mo = DecisionTreeClassifier(random_state=42, class_weight='balanced')
deci_tree_mo.fit(X_train, y_train)

In [48]:
joblib.dump(rf, 'random_forest_model.pkl')
joblib.dump(lo_mo, 'logistic_regression_model.pkl')
joblib.dump(deci_tree_mo, 'decision_tree_model.pkl')

['decision_tree_model.pkl']

In [345]:
# Model Evaluation
y_predict = rf.predict(X_test)
sona_accuracy_score = accuracy_score(y_test,y_predict)
# sona_accuracy_score

In [346]:
class Evaluation:

    def __init__(self,model,x_train,x_test,y_train,y_test):
        self.model = model
        self.x_train = x_train
        self.x_test = x_test
        self.y_train = y_train
        self.y_test = y_test

    def train_evaluation(self):
        y_pred_train = self.model.predict(self.x_train)

        accu_scr_train = accuracy_score(self.y_train,y_pred_train)
        print('Accuracy Score On trainng',accu_scr_train)
        print()

        con_mat_train = confusion_matrix(self.y_train,y_pred_train)
        print('Confusion Matix on Training Data Set:\n',con_mat_train)
        print()

        class_rep_train = classification_report(self.y_train, y_pred_train)
        print('Classification Report On Training Data Set:\n',class_rep_train)
        print()

    def test_evaluation(self):
        y_pred_test = self.model.predict(self.x_test)

        accu_scr_train = accuracy_score(self.y_test,y_pred_test)
        print('Accuracy Score On Testing Data Set',accu_scr_train)
        print()
        
        con_mat_train = confusion_matrix(self.y_test,y_pred_test)
        print('Confusion Matix on Testing Data Set:\n',con_mat_train)
        print()
        
        class_rep_train = classification_report(self.y_test, y_pred_test)
        print('Classification Report On Testing Data Set:\n',class_rep_train)

In [347]:
# Random forest Evaluation Training Evaluation
Evaluation(rf,X_train,X_test,y_train,y_test).train_evaluation()

Accuracy Score On trainng 0.9908960229995208

Confusion Matix on Training Data Set:
 [[  95    0]
 [  19 1973]]

Classification Report On Training Data Set:
               precision    recall  f1-score   support

           0       0.83      1.00      0.91        95
           1       1.00      0.99      1.00      1992

    accuracy                           0.99      2087
   macro avg       0.92      1.00      0.95      2087
weighted avg       0.99      0.99      0.99      2087




In [348]:
# Random Forest Testing Evaluation
Evaluation(rf,X_train,X_test,y_train,y_test).test_evaluation()

Accuracy Score On Testing Data Set 0.9329501915708812

Confusion Matix on Testing Data Set:
 [[  1  24]
 [ 11 486]]

Classification Report On Testing Data Set:
               precision    recall  f1-score   support

           0       0.08      0.04      0.05        25
           1       0.95      0.98      0.97       497

    accuracy                           0.93       522
   macro avg       0.52      0.51      0.51       522
weighted avg       0.91      0.93      0.92       522



In [349]:
# Logistic  Regression Training Evaluation
Evaluation(lo_mo,X_train,X_test,y_train,y_test).train_evaluation()

Accuracy Score On trainng 0.9693339722089123

Confusion Matix on Training Data Set:
 [[  95    0]
 [  64 1928]]

Classification Report On Training Data Set:
               precision    recall  f1-score   support

           0       0.60      1.00      0.75        95
           1       1.00      0.97      0.98      1992

    accuracy                           0.97      2087
   macro avg       0.80      0.98      0.87      2087
weighted avg       0.98      0.97      0.97      2087




In [350]:
# Logistic Regression Testing Evaluation
Evaluation(lo_mo,X_train,X_test,y_train,y_test).test_evaluation()

Accuracy Score On Testing Data Set 0.9157088122605364

Confusion Matix on Testing Data Set:
 [[  3  22]
 [ 22 475]]

Classification Report On Testing Data Set:
               precision    recall  f1-score   support

           0       0.12      0.12      0.12        25
           1       0.96      0.96      0.96       497

    accuracy                           0.92       522
   macro avg       0.54      0.54      0.54       522
weighted avg       0.92      0.92      0.92       522



In [351]:
# Decision Tree Training Evaluation
Evaluation(deci_tree_mo,X_train,X_test,y_train,y_test).train_evaluation()

Accuracy Score On trainng 0.9908960229995208

Confusion Matix on Training Data Set:
 [[  95    0]
 [  19 1973]]

Classification Report On Training Data Set:
               precision    recall  f1-score   support

           0       0.83      1.00      0.91        95
           1       1.00      0.99      1.00      1992

    accuracy                           0.99      2087
   macro avg       0.92      1.00      0.95      2087
weighted avg       0.99      0.99      0.99      2087




In [352]:
# Decision Tree Testing Evaluation
Evaluation(deci_tree_mo,X_train,X_test,y_train,y_test).test_evaluation()

Accuracy Score On Testing Data Set 0.8295019157088123

Confusion Matix on Testing Data Set:
 [[  6  19]
 [ 70 427]]

Classification Report On Testing Data Set:
               precision    recall  f1-score   support

           0       0.08      0.24      0.12        25
           1       0.96      0.86      0.91       497

    accuracy                           0.83       522
   macro avg       0.52      0.55      0.51       522
weighted avg       0.92      0.83      0.87       522



In [353]:
# class Preprocessing:

#     def __init__(self,speaker,data):
#         self.data = data
#         self.speaker = speaker

#     def text_preprocessing_user(self):
#         lm = WordNetLemmatizer()
#         pred_data = [self.data]
#         preprocess_data = []
#         label_encoder = LabelEncoder()
#         speakr = [self.speaker]
#         speakr = label_encoder.fit_transform(speakr)
#         for data in pred_data:
#             review = re.sub(r'[^a-zA-Z0-9]',' ',data)
#             review = review.lower()
#             review = review.split()
#             review = [lm.lemmatize(x) for x in review if x not in stopwords]
#             review = " ".join(review)
#             preprocess_data.append(review)
#         return speakr, preprocess_data

In [354]:
class Preprocessing:

    def __init__(self, speaker, data):
        self.data = data
        self.speaker = speaker

    def text_preprocessing_user(self):
        lm = WordNetLemmatizer()
        pred_data = [self.data]
        preprocess_data = []
        label_encoder = LabelEncoder()
        speakr = [self.speaker]
        speakr = label_encoder.fit_transform(speakr)
        
        # change stopwords to nltk's stopwords
        # stopwords = set(nltk_stopwords.words('english'))

        for data in pred_data:
            review = re.sub(r'[^a-zA-Z0-9]', ' ', data)
            review = review.lower()
            review = review.split()
            review = [lm.lemmatize(x) for x in review if x not in stopwords]
            review = " ".join(review)
            preprocess_data.append(review)
        
        return speakr, preprocess_data

    # New function to predict whether the claim is true or false
    def predict_claim(self, model):
        # Step 1: Preprocess the input data
        speakr, preprocessed_data = self.text_preprocessing_user()

        
        
        # Load your vectorizer 
    
        tf = TfidfVectorizer()
        X_claims = tf.fit_transform(preprocessed_list).toarray()

        # Create a sparse matrix for the encoded speakers
        X_speaker = sona_df['encoded_speaker'].values.reshape(-1, 1)  
        
        # Convert the encoded speaker array to a sparse matrix

        X_speaker_sparse = csr_matrix(X_speaker)

        # Combine TF-IDF matrix and encoded speaker matrix
        X_combined = hstack([X_claims, X_speaker_sparse])
        
        # Transform the preprocessed 
        transformed_data = X_combined

        prediction = model.predict(transformed_data)

        # Assuming the model returns 0 or 1, map that to "True" or "False"
        claim_status = "True" if prediction[0] == 1 else "False"

        # Step 4: Return the speaker label and the claim status
        return speakr[0], claim_status        

In [355]:
sona_df['claims'][1]


'The total debt stock stood at 41.1 trillion cedis at the end of December 2000 with 31.7 trillion external and 9.4 trillion domestic'

In [356]:
speaker, claim_processed = Preprocessing('H.E John Agyekum Kufour',sona_df['claims'][1]).text_preprocessing_user()

In [357]:
Preprocessing(speaker[0],claim_processed[0]).predict_claim(deci_tree_mo)

(0, 'True')