In [1]:
# import pickle to save the model and processed data
import pickle as pkl

import numpy as np
import pandas as pd

from sklearn import metrics
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.svm import SVC

In [17]:
# Load the pre-processed dataset (comma-separated) into a DataFrame.
data = pd.read_csv('DataSets/Processed_Data_small.csv', sep=',')

In [18]:
# Drop every row that contains a null value. Rebinding `data` (rather than
# using inplace=True) avoids mutating the frame object produced by the load
# cell and keeps the step chainable.
data = data.dropna()
data.info()  # summary of the remaining rows, columns and dtypes

<class 'pandas.core.frame.DataFrame'>
Index: 399852 entries, 0 to 399999
Data columns (total 7 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   generated           399852 non-null  float64
 1   text                399852 non-null  object 
 2   avg_len_sentences   399852 non-null  float64
 3   words_count         399852 non-null  int64  
 4   punctuations_count  399852 non-null  int64  
 5   readability_score   399852 non-null  float64
 6   stop_word_ratio     399852 non-null  float64
dtypes: float64(4), int64(2), object(1)
memory usage: 24.4+ MB


In [20]:
# Separate the model inputs (raw text plus the numeric columns) from the target label.
feature_columns = [
    'text',
    'avg_len_sentences',
    'punctuations_count',
    'words_count',
    'stop_word_ratio',
    'readability_score',
]
X = data[feature_columns]
y = data['generated']

In [21]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [22]:
# Text branch: unigram + bigram token counts, then TF-IDF re-weighting.
text_pipeline = Pipeline(
    steps=[
        ('count_vectorizer', CountVectorizer(ngram_range=(1, 2))),
        ('tfidf_transformer', TfidfTransformer()),
    ]
)

# Numeric branch: rescale each numeric column with MinMaxScaler.
numerical_pipeline = Pipeline(steps=[('scaler', MinMaxScaler())])

# Columns sent through the numeric branch (selected by name).
numeric_columns = [
    'avg_len_sentences',
    'punctuations_count',
    'readability_score',
    'words_count',
    'stop_word_ratio',
]

# Route the 'text' column and the numeric columns to their respective branches
# and concatenate the results into a single feature matrix.
preprocessor = ColumnTransformer(
    transformers=[
        ('text', text_pipeline, 'text'),
        ('numerical', numerical_pipeline, numeric_columns),
    ]
)

# End-to-end model: preprocessing followed by a multinomial Naive Bayes classifier.
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', MultinomialNB()),
    ]
)

In [23]:
# Train the Naive Bayes pipeline (preprocessor + classifier) on the training split.
pipeline.fit(X=X_train, y=y_train)

In [24]:
# Predict labels for the held-out test split with the Naive Bayes pipeline.
y_pred = pipeline.predict(X=X_test)

In [25]:
# Per-class precision / recall / F1 for the current predictions on the test split.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

         0.0       0.92      0.73      0.81     60081
         1.0       0.78      0.94      0.85     59875

    accuracy                           0.83    119956
   macro avg       0.85      0.83      0.83    119956
weighted avg       0.85      0.83      0.83    119956



In [26]:
# Persist the Naive Bayes pipeline (preprocessing + classifier) to disk with pickle.
with open('NaiveBayesClassifier.pkl', mode='wb') as file:
    pkl.dump(pipeline, file)

In [27]:
# Separate pipeline for logistic regression.
# The preprocessor is cloned so this pipeline owns an independent, unfitted copy
# with the same parameters. Pipeline.fit() fits its steps in place, so passing
# the same `preprocessor` object would make pipeline2.fit() refit the very
# transformer instance that `pipeline` also holds.
pipeline2 = Pipeline([
    ('preprocessor', clone(preprocessor)),
    ('classifier', LogisticRegression())  # Replace with any model
])

In [28]:
# Train the logistic regression pipeline on the same training split.
pipeline2.fit(X=X_train, y=y_train)

In [29]:
# Predict test-split labels with the logistic regression pipeline.
# Note: this rebinds y_pred, replacing the Naive Bayes predictions.
y_pred = pipeline2.predict(X=X_test)

In [30]:
# Per-class precision / recall / F1 for the current predictions on the test split.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

         0.0       0.94      0.89      0.91     60081
         1.0       0.90      0.94      0.92     59875

    accuracy                           0.92    119956
   macro avg       0.92      0.92      0.92    119956
weighted avg       0.92      0.92      0.92    119956



In [31]:
# Persist the logistic regression pipeline (preprocessing + classifier) to disk with pickle.
with open('LogisticRegressionClassifier2.pkl', mode='wb') as file:
    pkl.dump(pipeline2, file)