In [27]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfTransformer
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
import nltk
from joblib import dump
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [28]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [29]:
data = pd.read_csv("/content/drive/MyDrive/dataset1.csv")


In [30]:
# Text preprocessing: Tokenization, stemming, and lemmatization
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [31]:
def preprocess_text(text):
    words = nltk.word_tokenize(text)
    words = [word.lower() for word in words if word.isalpha()]  # Remove non-alphabetic characters
    words = [word for word in words if word not in stop_words]  # Remove stopwords
    words = [stemmer.stem(word) for word in words]  # Stemming
    words = [lemmatizer.lemmatize(word) for word in words]  # Lemmatization
    return ' '.join(words)

In [32]:
data['processed_text'] = data['text'].apply(preprocess_text)

# Separate features (text) and label
X = data['processed_text']
y = data['label']

In [33]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [34]:
vectorizer = CountVectorizer()
X_train_bow = vectorizer.fit_transform(X_train)
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_bow)
X_test_bow = vectorizer.transform(X_test)
X_test_tfidf = tfidf_transformer.fit_transform(X_test_bow)

In [35]:
classifier = MultinomialNB()
classifier.fit(X_train_tfidf, y_train)


In [36]:
y_pred = classifier.predict(X_test_tfidf)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.83


In [37]:
print("Classification Report:\n", classification_report(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.69      0.81       244
           1       0.75      0.97      0.85       228

    accuracy                           0.83       472
   macro avg       0.86      0.83      0.83       472
weighted avg       0.86      0.83      0.83       472



In [38]:
# pickle.dump(vectorizer, open("vectorizer.pkl", "wb"))
# vectorizer=pickle.load(open('vectorizer.pkl',"rb"))
# pickle.dump(classifier,open('model.pkl','wb'))
# model=pickle.load(open('model.pkl','rb'))
dump(classifier, 'model.joblib')
dump(vectorizer, 'vectorizer.joblib')

['vectorizer.joblib']