In [15]:
import pandas as pd

# Load the dataset
file_path = r"C:\Users\aksha\OneDrive\Desktop\spam.csv"
df = pd.read_csv(file_path, encoding='latin-1')

# Display the first few rows of the dataset
print("First few rows of the dataset:")
df.head()

First few rows of the dataset:


Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [33]:
# Display basic information about the dataset
print("\nBasic dataset information:")
df.info()



Basic dataset information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   label    0 non-null      float64
 1   message  5572 non-null   object 
dtypes: float64(1), object(1)
memory usage: 87.2+ KB


In [35]:
# Now rename the columns as needed
df.columns = ['label', 'message']

# Display class distribution
print("Class distribution:")
print(df['label'].value_counts())


Class distribution:
Series([], Name: count, dtype: int64)


In [21]:
# Renaming columns for clarity
df.columns = ['label', 'message']

# Dropping unnecessary columns (if any)
df.dropna(axis=1, how='all', inplace=True)

# Converting labels to binary values: 'ham' -> 0 and 'spam' -> 1
df['label'] = df['label'].map({'ham': 0, 'spam': 1})

# Displaying the first few rows after preprocessing
print("First few rows after preprocessing:")
df.head()


First few rows after preprocessing:


Unnamed: 0,label,message
0,,"Go until jurong point, crazy.. Available only ..."
1,,Ok lar... Joking wif u oni...
2,,Free entry in 2 a wkly comp to win FA Cup fina...
3,,U dun say so early hor... U c already then say...
4,,"Nah I don't think he goes to usf, he lives aro..."


In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF Vectorizer
tfidf = TfidfVectorizer(max_features=5000, stop_words='english')

# Fit and transform the message column
X = tfidf.fit_transform(df['message'])

# Extracting labels
y = df['label']

print("Shape of the feature matrix:", X.shape)


Shape of the feature matrix: (5572, 5000)


In [19]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize the Naive Bayes model
nb_model = MultinomialNB()

# Train the model
nb_model.fit(X_train, y_train)

# Predict on the test set
y_pred = nb_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Model Accuracy: 0.9725

Classification Report:
              precision    recall  f1-score   support

           0       0.97      1.00      0.98      1453
           1       1.00      0.79      0.88       219

    accuracy                           0.97      1672
   macro avg       0.98      0.89      0.93      1672
weighted avg       0.97      0.97      0.97      1672



In [21]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Initialize models
log_reg = LogisticRegression(max_iter=1000)
svc = SVC()

# List of models
models = {'Logistic Regression': log_reg, 'Support Vector Machine': svc}

for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{model_name} Accuracy: {accuracy:.4f}")


Logistic Regression Accuracy: 0.9480
Support Vector Machine Accuracy: 0.9761


In [23]:
from sklearn.model_selection import GridSearchCV

# Define parameter grid for Logistic Regression
param_grid = {'C': [0.1, 1, 10, 100]}

# Grid search for Logistic Regression
grid_search = GridSearchCV(log_reg, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Best parameters and best score
print("Best Parameters for Logistic Regression:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)


Best Parameters for Logistic Regression: {'C': 100}
Best Score: 0.9779487179487178


In [27]:
from flask import Flask, request, jsonify

app = Flask(__name__)

@app.route('/predict', methods=['POST'])
def predict():
    message = request.json['message']
    vectorized_message = tfidf.transform([message])
    prediction = nb_model.predict(vectorized_message)
    return jsonify({'prediction': 'spam' if prediction[0] else 'ham'})

if __name__ == '__main__':
    app.run(debug=True)


 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
 * Restarting with watchdog (windowsapi)


SystemExit: 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
