In [1]:
!pip install nltk



In [2]:
import nltk

In [3]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
import json
import nltk
from nltk.corpus import stopwords
import re
from google.colab import drive

drive.mount('/content/drive')
# nltk.download('stopwords')

def preprocess_text(text):
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Convert to lowercase
    text = text.lower()
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    words = nltk.word_tokenize(text)
    words = [word for word in words if word not in stop_words]
    return ' '.join(words)

file_path = '/content/drive/My Drive/dataset.json'

with open(file_path, 'r') as file:
    chit_chat_data = json.load(file)

# Load and preprocess novel dataset
chit_chat_texts = []
for key, value in chit_chat_data.items():
    messages = value.get("messages", [])
    for message_list in messages:
        # print(message_list)
        for message in message_list:
            text = message.get("text", "")
            # print(text)
            chit_chat_texts.append(text)

for i in range(len(chit_chat_texts)):
    chit_chat_texts[i] = preprocess_text(chit_chat_texts[i])

novel_texts = []
for i in range(1, 11):
    with open(f'novel_1 ({i}).txt', 'r', encoding='utf-8') as file:
        novel_text = file.read()
        # Split novel text into paragraphs (you may need to adjust this based on the actual structure)
        paragraphs = novel_text.split('.')  # Adjust the delimiter based on your text structure
        novel_texts.extend(paragraphs)

for i in range(len(novel_texts)):
    novel_texts[i] = preprocess_text(novel_texts[i])

# Create labels for the datasets (1 for chit-chat, 0 for novel)

labels_chit_chat = [1] * len(chit_chat_texts)
labels_novel = [0] * len(novel_texts)
print('labels_chit_chat : ', len(labels_chit_chat))
print('labels_novel : ', len(labels_novel))
# Combine datasets
all_texts = chit_chat_texts + novel_texts
all_labels = labels_chit_chat + labels_novel

print('Split the data into training and testing sets')
X_train, X_test, y_train, y_test = train_test_split(all_texts, all_labels, test_size=0.001, random_state=42)

print('Convert text data to TF-IDF features')
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

print('Train a Support Vector Machine (SVM) classifier')
svm_classifier = SVC(kernel='linear')
svm_classifier.fit(X_train_tfidf, y_train)

print('# Make predictions on the test set')
y_pred = svm_classifier.predict(X_test_tfidf)

print('Evaluate the model')
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')
print(classification_report(y_test, y_pred))


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
labels_chit_chat :  258145
labels_novel :  18718
Split the data into training and testing sets
Convert text data to TF-IDF features
Train a Support Vector Machine (SVM) classifier
# Make predictions on the test set
Evaluate the model
Accuracy: 0.97
              precision    recall  f1-score   support

           0       0.92      0.65      0.76        17
           1       0.98      1.00      0.99       260

    accuracy                           0.97       277
   macro avg       0.95      0.82      0.87       277
weighted avg       0.97      0.97      0.97       277



In [10]:
new_texts = ["did you eat yesterday", "how are you doing"]
# new_texts[0] = preprocess_text(new_texts[0])
new_texts_tfidf = tfidf_vectorizer.transform(new_texts)

predictions = svm_classifier.predict(new_texts_tfidf)

# Display the predictions
print(predictions)

[1 1]


In [9]:
import joblib  # For scikit-learn versions < 0.24

# Save the trained model to a pickle file
model_filename = '/content/drive/My Drive/svm_model_new.pkl'
joblib.dump(svm_classifier, model_filename)
joblib.dump(tfidf_vectorizer, '/content/drive/My Drive/tfidf_vectorizer.pkl')
print(f"Trained model saved to {model_filename}")


Trained model saved to /content/drive/My Drive/svm_model_new.pkl


['/content/drive/My Drive/tfidf_vectorizer.pkl']