In [3]:
import os
import re
import time

import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Directories holding the .txt samples of the two classes
melone_dir = 'melone'
benone_dir = 'benone'

# Read every .txt file into one record per document. Records are collected in a
# plain list and turned into a DataFrame once; concatenating a one-row frame per
# file re-copies the whole table on every iteration (quadratic in file count).
records = []
for class_dir, label in ((melone_dir, 'Melone'), (benone_dir, 'Benone')):
    # os.listdir returns entries in arbitrary order; sort for a stable row order
    for file in sorted(os.listdir(class_dir)):
        if not file.endswith('.txt'):
            continue
        # Explicit encoding so the result does not depend on the platform
        # default; undecodable bytes are replaced instead of aborting the load.
        with open(os.path.join(class_dir, file), 'r', encoding='utf-8', errors='replace') as f:
            records.append({'Content': f.read(), 'Type': label})

# `columns=` keeps the expected schema even when no .txt files were found
data = pd.DataFrame(records, columns=['Content', 'Type'])

# Shuffle the rows; seeded so that a Restart & Run All reproduces the same order
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

# --- Text normalisation -----------------------------------------------------
# NLTK's English stop-word list (requires the NLTK 'stopwords' corpus to be
# available locally).
stop_words = set(stopwords.words('english'))

# Raw string: '\s' inside a normal string literal is an invalid escape sequence
# and triggers a warning on recent Python versions.
_non_alnum = re.compile(r'[^a-zA-Z0-9\s]')


def _clean_text(text):
    """Drop non-alphanumeric characters, lowercase, and remove stop words.

    The order matters: punctuation is stripped *before* the stop-word lookup,
    so a token such as "don't" is compared as "dont".
    """
    text = _non_alnum.sub('', text).lower()
    return ' '.join(word for word in text.split() if word not in stop_words)


data['Content'] = data['Content'].apply(_clean_text)

# --- Bag-of-words features ----------------------------------------------------
# The vectorizer must be created and fitted here: previously `cv` was only
# defined in commented-out code, so this cell raised NameError on a fresh kernel.
# NOTE(review): the vocabulary is learned from the full corpus, i.e. before the
# train/test split performed later in this cell — confirm this is acceptable.
cv = CountVectorizer()
X = cv.fit_transform(data['Content'])
y = np.array(data['Type'])

# Optional chi-square feature selection (currently disabled, so the classifier
# below is trained on the full bag-of-words matrix):
# ch2 = SelectKBest(chi2, k=41)
# X = ch2.fit_transform(X, y)

# Split the dataset into training and testing sets.
# NOTE(review): test_size is 0.299999 rather than 0.3 — presumably to influence
# how the test-set size is rounded; confirm before "simplifying" it.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.299999, random_state=42)

# Train a Gradient Boosting classifier. random_state is fixed so that repeated
# runs on the same split build the same ensemble and report the same metrics.
clf = GradientBoostingClassifier(random_state=42)
clf.fit(X_train, y_train)

# Evaluate the classifier on the held-out test set.
# (For a per-class breakdown, use instead:
#  print("Classification Report:\n", classification_report(y_test, y_pred)))
y_pred = clf.predict(X_test)

# Macro averaging: each class contributes equally to precision/recall/F1,
# regardless of how many test samples it has.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# Print the classification metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-Score:", f1)


Accuracy: 0.9983221476510067
Precision: 0.997120647757632
Recall: 0.997120647757632
F1-Score: 0.997120647757632


In [5]:
# Measure how long one prediction pass over the test set takes.
# perf_counter() is a monotonic, high-resolution clock intended for timing short
# intervals; time.time() is a wall clock that can be coarse or adjusted mid-run.
start_time = time.perf_counter()
clf.predict(X_test)
end_time = time.perf_counter()
print("Time taken: ", end_time - start_time, " seconds")


Time taken:  0.007978200912475586  seconds


In [5]:
# bestaccuracy = 0
# import numpy as np

# arr = np.arange(0.1, 0.3+0.025, 0.025)
# # [0.1   0.125 0.15  0.175 0.2   0.225 0.25  0.275 0.3  ]

# cv = CountVectorizer()
# cv.fit_transform(data['Content'])

# # Transform the text data into a matrix of features
# X = cv.transform(data['Content'])
# y = np.array(data['Type'])


# # Search over the number of chi-square features (k) and the test-set fraction
# for i in range(1,1000):
#     for j in arr:
#         ch2 = SelectKBest(chi2, k=i)
#         Xi = ch2.fit_transform(X, y)

#         # Split the dataset into training and testing sets
#         X_train, X_test, y_train, y_test = train_test_split(Xi, y, test_size=j, random_state=42)

#         # Train a Gradient Boosting classifier on the selected features

#         clf = GradientBoostingClassifier()
#         clf.fit(X_train, y_train)

#         # Evaluate the classifier on the testing set
#         accuracy = clf.score(X_test, y_test)
#         if bestaccuracy  < accuracy:
#             bestaccuracy = accuracy
#             print("The best:", bestaccuracy, 'fnumber ', i, "size of ds", j)