Because of several problems training the model, we want to know what where the problem can be found, by looking at datatype for each column.  

In [1]:
import pandas as pd

# Load the dataset
df = pd.read_csv('features_dataset.csv')

# Print data types for each column
print(df.dtypes)

label                    int64
TTR                    float64
text_polarity          float64
text_subjectivity      float64
title_polarity         float64
title_subjectivity     float64
doc_perplexity         float64
1_grams                 object
average_word_length    float64
dtype: object


it can be identified from the code above, that '1_grams' column are object datatype which means it include letters. Therefore we need to vectorize the column to "float", which means it convert letters to numbers

In [14]:
from sklearn.feature_extraction.text import CountVectorizer

# Initialize the vectorizer
vectorizer = CountVectorizer()

# Vectorize the text data directly
X_vectorized = vectorizer.fit_transform(df['1_grams'])


# Drop the non-numeric column and convert DataFrame to a sparse DataFrame to concatenate with X_vectorized
df_dropped = df.drop('1_grams', axis=1).astype(float)  # Convert other columns to float
from scipy.sparse import csr_matrix, hstack
df_dropped_sparse = csr_matrix(df_dropped.values)  # Convert DataFrame to sparse matrix
X_final = hstack([df_dropped_sparse, X_vectorized])  # Concatenate the matrices



In [15]:

# Function to safely convert columns to numeric and check for negative values
def check_negatives(dataframe):
    negative_counts = {}
    for column in dataframe.columns:
        # Convert column to numeric, non-convertible values become NaN
        converted_column = pd.to_numeric(dataframe[column], errors='coerce')
        # Count negative values
        count = (converted_column < 0).sum()
        if count > 0:
            negative_counts[column] = count
    return negative_counts

# Apply the function to your DataFrame
negative_columns = check_negatives(df)

# Print the columns with negative values and their counts
print("Columns with negative values and their counts:")
for column, count in negative_columns.items():
    print(f"{column}: {count} negatives")


Columns with negative values and their counts:
text_polarity: 3097 negatives
title_polarity: 3117 negatives


In [16]:
def check_negatives_exclude_non_numeric(dataframe):
    negative_counts = {}
    for column in dataframe.columns:
        # Attempt to convert column to numeric
        converted_column = pd.to_numeric(dataframe[column], errors='coerce')
        # Continue only if the column is numeric
        if converted_column.dtype != 'object':
            # Count negative values
            count = (converted_column < 0).sum()
            if count > 0:
                negative_counts[column] = count
    return negative_counts


In [17]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pandas as pd

# Load the dataset
df = pd.read_csv('features_dataset.csv')  # Adjust the path as necessary

# Convert all columns to numeric, handling non-numeric gracefully
df = df.apply(pd.to_numeric, errors='coerce')

# Fill NaN values that result from conversion errors
df.fillna(0, inplace=True)

# Initialize the scaler
scaler = MinMaxScaler()

# Fit and transform the data to scale each feature to [0, 1] range
df_scaled = scaler.fit_transform(df.drop('label', axis=1))

# Split the data into features and target
X = df_scaled
y = df['label']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Multinomial Naive Bayes classifier
model = MultinomialNB()

# Train the model
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_mat = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Confusion Matrix:\n", conf_mat)
print("Classification Report:\n", report)


Accuracy: 0.6176364293458619
Confusion Matrix:
 [[954 399]
 [659 755]]
Classification Report:
               precision    recall  f1-score   support

           0       0.59      0.71      0.64      1353
           1       0.65      0.53      0.59      1414

    accuracy                           0.62      2767
   macro avg       0.62      0.62      0.62      2767
weighted avg       0.62      0.62      0.62      2767



lets try to optimize the model using GridSearchCV

In [10]:
from sklearn.model_selection import GridSearchCV

# Parameters grid to search
param_grid = {'alpha': [0.01, 0.1, 1, 10, 100]}

# Initialize the GridSearchCV object
grid_search = GridSearchCV(MultinomialNB(), param_grid, cv=5, scoring='accuracy')

# Fit it to the data
grid_search.fit(X_train, y_train)

# Print the best parameters and best score
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))


Best parameters: {'alpha': 10}
Best cross-validation score: 0.63


In [11]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

# Assuming df is your DataFrame and you have already vectorized your features into X and extracted y

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the classifier with the best parameters found
best_alpha = grid_search.best_params_['alpha']
model = MultinomialNB(alpha=best_alpha)

# Train the model
model.fit(X_train, y_train)

# Predict on the test data
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_mat = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Confusion Matrix:\n", conf_mat)
print("Classification Report:\n", report)


Accuracy: 0.6176364293458619
Confusion Matrix:
 [[954 399]
 [659 755]]
Classification Report:
               precision    recall  f1-score   support

           0       0.59      0.71      0.64      1353
           1       0.65      0.53      0.59      1414

    accuracy                           0.62      2767
   macro avg       0.62      0.62      0.62      2767
weighted avg       0.62      0.62      0.62      2767



In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Plotting confusion matrix
plt.figure(figsize=(10, 7))
sns.heatmap(conf_mat, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix')
plt.show()