In [6]:

# Using Naive Bayes

import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Step 1: Load the datasets
train_df = pd.read_csv("train.csv")
val_df = pd.read_csv("val.csv")

# Step 2: Preprocess the data
# Drop any rows with missing values
train_df = train_df.dropna()
val_df = val_df.dropna()

# Encode categorical variables
label_encoder = LabelEncoder()
train_df['label'] = label_encoder.fit_transform(train_df['label'])
val_df['label'] = label_encoder.transform(val_df['label'])

# Separate features (X) and target labels (y)
X_train = train_df.drop(columns=['label'])
y_train = train_df['label']

X_val = val_df.drop(columns=['label'])
y_val = val_df['label']

# Step 3: Preprocess text data (if applicable)
# Assuming the text data is in a column named 'text_column'
text_vectorizer = TfidfVectorizer()  # Use TF-IDF vectorizer for text data
X_train_text = text_vectorizer.fit_transform(X_train['text'])
X_val_text = text_vectorizer.transform(X_val['text'])

# Combine text features with other features (if any)
X_train_combined = X_train_text
X_val_combined = X_val_text

# Step 4: Train the Naive Bayes model
naive_bayes_model = MultinomialNB()  # Using Multinomial Naive Bayes for text classification
naive_bayes_model.fit(X_train_combined, y_train)

# Step 5: Evaluate the model on the validation set
val_preds = naive_bayes_model.predict(X_val_combined)

# Compute evaluation metrics
accuracy = accuracy_score(y_val, val_preds)
precision = precision_score(y_val, val_preds, average='weighted')
recall = recall_score(y_val, val_preds, average='weighted')
f1 = f1_score(y_val, val_preds, average='weighted')
conf_matrix = confusion_matrix(y_val, val_preds)

print("Validation Set Metrics:")
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-score: {f1}")
print("Confusion Matrix:")
print(conf_matrix)


Validation Set Metrics:
Accuracy: 0.8507215641648994
Precision: 0.8691276045965041
Recall: 0.8507215641648994
F1-score: 0.8339674185074238
Confusion Matrix:
[[ 5649  5559]
 [  213 27245]]


In [5]:
print(train_df.columns)
print(val_df.columns)

Index(['text', 'label'], dtype='object')
Index(['text', 'label'], dtype='object')


In [7]:

# Deep Feed Forward Neural Networks

import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import Adam

# Step 1: Load the datasets
train_df = pd.read_csv("train.csv")
val_df = pd.read_csv("val.csv")

# Step 2: Preprocess the data
# Drop any rows with missing values
train_df = train_df.dropna()
val_df = val_df.dropna()

# Encode categorical variables
label_encoder = LabelEncoder()
train_df['label'] = label_encoder.fit_transform(train_df['label'])
val_df['label'] = label_encoder.transform(val_df['label'])

# Separate features (X) and target labels (y)
X_train = train_df['text']
y_train = train_df['label']

X_val = val_df['text']
y_val = val_df['label']

# Step 3: Preprocess text data
text_vectorizer = TfidfVectorizer()  # Use TF-IDF vectorizer for text data
X_train_text = text_vectorizer.fit_transform(X_train)
X_val_text = text_vectorizer.transform(X_val)

# Convert labels to one-hot encoding
num_classes = len(np.unique(y_train))
y_train_one_hot = np.eye(num_classes)[y_train]
y_val_one_hot = np.eye(num_classes)[y_val]

# Step 4: Build the neural network model
model = Sequential()
model.add(Dense(128, input_shape=(X_train_text.shape[1],), activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes, activation='softmax'))

# Compile the model # Adam Parameters 
model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])

# Step 5: Train the model
history = model.fit(X_train_text, y_train_one_hot, batch_size=32, epochs=10, validation_split=0.2)

# Step 6: Evaluate the model on the validation set
val_preds_prob = model.predict(X_val_text)
val_preds = np.argmax(val_preds_prob, axis=1)

# Compute evaluation metrics
accuracy = accuracy_score(y_val, val_preds)
precision = precision_score(y_val, val_preds, average='weighted')
recall = recall_score(y_val, val_preds, average='weighted')
f1 = f1_score(y_val, val_preds, average='weighted')
conf_matrix = confusion_matrix(y_val, val_preds)

print("Validation Set Metrics:")
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-score: {f1}")
print("Confusion Matrix:")
print(conf_matrix)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m5801/5801[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1330s[0m 229ms/step - accuracy: 0.9134 - loss: 0.2192 - val_accuracy: 0.9528 - val_loss: 0.1247
Epoch 2/10
[1m5801/5801[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1207s[0m 208ms/step - accuracy: 0.9681 - loss: 0.0872 - val_accuracy: 0.9562 - val_loss: 0.1217
Epoch 3/10
[1m5801/5801[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1116s[0m 192ms/step - accuracy: 0.9805 - loss: 0.0552 - val_accuracy: 0.9587 - val_loss: 0.1293
Epoch 4/10
[1m5801/5801[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1152s[0m 199ms/step - accuracy: 0.9871 - loss: 0.0378 - val_accuracy: 0.9566 - val_loss: 0.1355
Epoch 5/10
[1m5801/5801[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1225s[0m 211ms/step - accuracy: 0.9904 - loss: 0.0285 - val_accuracy: 0.9583 - val_loss: 0.1597
Epoch 6/10
[1m5801/5801[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1224s[0m 211ms/step - accuracy: 0.9933 - loss: 0.0198 - val_accuracy: 0.9572 - val

In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier

# Step 1: Load the datasets
try:
    train_df = pd.read_csv("train.csv")
    val_df = pd.read_csv("val.csv")
except FileNotFoundError:
    print("CSV files not found. Please check the file paths.")
    raise

# Step 2: Preprocess the data
# Drop any rows with missing values
train_df = train_df.dropna()
val_df = val_df.dropna()

# Inspect the data
print(f"Training data shape: {train_df.shape}")
print(f"Validation data shape: {val_df.shape}")

# Encode categorical variables
label_encoder = LabelEncoder()
train_df['label'] = label_encoder.fit_transform(train_df['label'])
val_df['label'] = label_encoder.transform(val_df['label'])

# Separate features (X) and target labels (y)
X_train = train_df['text']
y_train = train_df['label']

X_val = val_df['text']
y_val = val_df['label']

# Step 3: Preprocess text data
text_vectorizer = TfidfVectorizer()  # Use TF-IDF vectorizer for text data
X_train_text = text_vectorizer.fit_transform(X_train)
X_val_text = text_vectorizer.transform(X_val)

# Check the shape of the transformed data
print(f"TF-IDF transformed training data shape: {X_train_text.shape}")
print(f"TF-IDF transformed validation data shape: {X_val_text.shape}")

# Step 4: Build the Decision Tree model
dt_model = DecisionTreeClassifier()

# Train the model
dt_model.fit(X_train_text, y_train)

# Step 5: Evaluate the Decision Tree model on the validation set
dt_val_preds = dt_model.predict(X_val_text)

# Compute evaluation metrics for Decision Tree model
dt_accuracy = accuracy_score(y_val, dt_val_preds)
dt_precision = precision_score(y_val, dt_val_preds, average='weighted')
dt_recall = recall_score(y_val, dt_val_preds, average='weighted')
dt_f1 = f1_score(y_val, dt_val_preds, average='weighted')
dt_conf_matrix = confusion_matrix(y_val, dt_val_preds)

# Print the evaluation metrics
print(f"Decision Tree Model Accuracy: {dt_accuracy}")
print(f"Decision Tree Model Precision: {dt_precision}")
print(f"Decision Tree Model Recall: {dt_recall}")
print(f"Decision Tree Model F1 Score: {dt_f1}")
print("Decision Tree Model Confusion Matrix:")
print(dt_conf_matrix)


Training data shape: (232003, 2)
Validation data shape: (38666, 2)
TF-IDF transformed training data shape: (232003, 134785)
TF-IDF transformed validation data shape: (38666, 134785)
Decision Tree Model Accuracy: 0.8395748202555217
Decision Tree Model Precision: 0.8396898179389932
Decision Tree Model Recall: 0.8395748202555217
Decision Tree Model F1 Score: 0.839631863108545
Decision Tree Model Confusion Matrix:
[[ 8120  3088]
 [ 3115 24343]]


In [1]:
import pandas as pd
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Step 1: Load the datasets
train_df = pd.read_csv("train.csv")
val_df = pd.read_csv("val.csv")

# Step 2: Preprocess the data
train_df = train_df.dropna()
val_df = val_df.dropna()

# Encode categorical variables
label_encoder = LabelEncoder()
train_df['label'] = label_encoder.fit_transform(train_df['label'])
val_df['label'] = label_encoder.transform(val_df['label'])

# Separate features (X) and target labels (y)
X_train = train_df['text']
y_train = train_df['label']
X_val = val_df['text']
y_val = val_df['label']

# Step 3: Preprocess text data
text_vectorizer = TfidfVectorizer()
X_train_text = text_vectorizer.fit_transform(X_train)
X_val_text = text_vectorizer.transform(X_val)

# Naive Bayes Model
naive_bayes_model = MultinomialNB()
naive_bayes_model.fit(X_train_text, y_train)
nb_val_preds = naive_bayes_model.predict(X_val_text)

# Decision Tree Model
dt_model = DecisionTreeClassifier()
dt_model.fit(X_train_text, y_train)
dt_val_preds = dt_model.predict(X_val_text)

# Deep Feed Forward Neural Network Model
num_classes = len(np.unique(y_train))
input_dim = X_train_text.shape[1]

# Build the neural network model
nn_model = Sequential()
nn_model.add(Dense(128, input_dim=input_dim, activation='relu'))
nn_model.add(Dropout(0.5))
nn_model.add(Dense(64, activation='relu'))
nn_model.add(Dropout(0.5))
nn_model.add(Dense(num_classes, activation='softmax'))

# Compile the model
nn_model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])

# Convert labels to one-hot encoding
y_train_one_hot = np.eye(num_classes)[y_train]

# Train the model
nn_model.fit(X_train_text, y_train_one_hot, batch_size=32, epochs=10, validation_split=0.2, verbose=0)

# Predicting with Neural Network
nn_val_preds_prob = nn_model.predict(X_val_text)
nn_val_preds = np.argmax(nn_val_preds_prob, axis=1)

# Combine predictions using majority voting
combined_preds = np.array([nb_val_preds, dt_val_preds, nn_val_preds])
ensemble_preds = np.apply_along_axis(lambda x: np.bincount(x).argmax(), axis=0, arr=combined_preds)

# Compute evaluation metrics for ensemble model
ensemble_accuracy = accuracy_score(y_val, ensemble_preds)
ensemble_precision = precision_score(y_val, ensemble_preds, average='weighted')
ensemble_recall = recall_score(y_val, ensemble_preds, average='weighted')
ensemble_f1 = f1_score(y_val, ensemble_preds, average='weighted')
ensemble_conf_matrix = confusion_matrix(y_val, ensemble_preds)

# Print the evaluation metrics for ensemble model
print("Ensemble Model Validation Set Metrics:")
print(f"Accuracy: {ensemble_accuracy}")
print(f"Precision: {ensemble_precision}")
print(f"Recall: {ensemble_recall}")
print(f"F1-score: {ensemble_f1}")
print("Confusion Matrix:")
print(ensemble_conf_matrix)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m1209/1209[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 7ms/step
Ensemble Model Validation Set Metrics:
Accuracy: 0.9273263332126416
Precision: 0.9295372914828022
Recall: 0.9273263332126416
F1-score: 0.9249628733057884
Confusion Matrix:
[[ 8773  2435]
 [  375 27083]]
