In [5]:
import os
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC

# Define the directories for the two classes
melone_dir = 'melone'
benone_dir = 'benone'

# Load the text files into a DataFrame
data = pd.DataFrame(columns=['Content', 'Type'])
for file in os.listdir(melone_dir):
    if file.endswith('.txt'):
        with open(os.path.join(melone_dir, file), 'r') as f:
            content = f.read()
            new_row = pd.DataFrame({'Content': [content], 'Type': ['Melone']})
            data = pd.concat([data, new_row], ignore_index=True)
for file in os.listdir(benone_dir):
    if file.endswith('.txt'):
        with open(os.path.join(benone_dir, file), 'r') as f:
            content = f.read()
            new_row = pd.DataFrame({'Content': [content], 'Type': ['Benone']})
            data = pd.concat([data, new_row], ignore_index=True)

# Shuffle the DataFrame rows
data = data.sample(frac=1).reset_index(drop=True)

# Create a CountVectorizer object and fit it on the content of the text files
cv = CountVectorizer()
X = cv.fit_transform(data['Content'])
y = data['Type']


In [6]:
from nltk.corpus import stopwords
import re

stop_words = set(stopwords.words('english'))
data['Content'] = data['Content'].apply(lambda x: re.sub('[^a-zA-Z0-9\s]', '', x))
data['Content'] = data['Content'].apply(lambda x: x.lower())
data['Content'] = data['Content'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

In [15]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np


# Create a CountVectorizer object and fit the data
cv = CountVectorizer()
cv.fit_transform(data['Content'])

# Transform the text data into a matrix of features
X = cv.transform(data['Content'])
y = np.array(data['Type'])

# Apply chi-square feature selection
ch2 = SelectKBest(chi2, k=23)
X = ch2.fit_transform(X, y)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Train a Linear SVM classifier on the selected features
clf = LinearSVC()
clf.fit(X_train, y_train)

# Evaluate the classifier on the testing set
y_pred = clf.predict(X_test)

# Calculate the classification metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# Print the classification metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-Score:", f1)


Accuracy: 0.9979859013091642
Precision: 0.9965159361711086
Recall: 0.9965159361711086
F1-Score: 0.9965159361711086




In [16]:
import time

start_time = time.time()
clf.predict(X_test)
end_time = time.time()
print("Time taken: ", end_time - start_time, " seconds")


Time taken:  0.001999378204345703  seconds


In [4]:
# import warnings
# from sklearn.exceptions import ConvergenceWarning

# bestaccuracy = 0
# import numpy as np

# arr = np.arange(0.2, 0.3+0.025, 0.025)
# # [0.1   0.125 0.15  0.175 0.2   0.225 0.25  0.275 0.3  ]

# cv = CountVectorizer()
# cv.fit_transform(data['Content'])

# # Transform the text data into a matrix of features
# X = cv.transform(data['Content'])
# y = np.array(data['Type'])

# for i in range(1,1000):
#     for j in arr:
#         ch2 = SelectKBest(chi2, k=i)
#         Xi = ch2.fit_transform(X, y)

#         # Split the dataset into training and testing sets
#         X_train, X_test, y_train, y_test = train_test_split(Xi, y, test_size=j, random_state=42)

#         # Train a Linear SVM classifier on the selected features
#         with warnings.catch_warnings():
#             warnings.filterwarnings("ignore", category=ConvergenceWarning)
#             clf = LinearSVC()
#             clf.fit(X_train, y_train)

#         # Evaluate the classifier on the testing set
#         accuracy = clf.score(X_test, y_test)
#         if bestaccuracy  < accuracy:
#             bestaccuracy = accuracy
#             print("The best:", bestaccuracy, 'fnumber ', i, "size of ds", j)


The best: 0.8176100628930818 fnumber  1 size of ds 0.2
The best: 0.8187311178247734 fnumber  1 size of ds 0.25
The best: 0.8313758389261745 fnumber  1 size of ds 0.3
The best: 0.9547169811320755 fnumber  4 size of ds 0.2
The best: 0.962248322147651 fnumber  4 size of ds 0.3
The best: 0.9642058165548099 fnumber  5 size of ds 0.225
The best: 0.9789569990850869 fnumber  5 size of ds 0.275
The best: 0.989937106918239 fnumber  13 size of ds 0.2
The best: 0.9908508691674291 fnumber  13 size of ds 0.275
The best: 0.9911949685534591 fnumber  14 size of ds 0.2
The best: 0.9932885906040269 fnumber  14 size of ds 0.225
The best: 0.9944071588366891 fnumber  16 size of ds 0.225
The best: 0.9969788519637462 fnumber  16 size of ds 0.25
The best: 0.9974842767295597 fnumber  23 size of ds 0.2
The best: 0.9988814317673378 fnumber  23 size of ds 0.225
The best: 1.0 fnumber  23 size of ds 0.25


In [4]:
# Predict labels for the testing set
from sklearn.metrics import confusion_matrix
y_pred = clf.predict(X_test)

# Compute the confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Extract TP, TN, FP, FN from the confusion matrix
TP = cm[1, 1]
TN = cm[0, 0]
FP = cm[0, 1]
FN = cm[1, 0]

# Print the results
print("TP:", TP)
print("TN:", TN)
print("FP:", FP)
print("FN:", FN)


TP: 639
TN: 153
FP: 3
FN: 0
