In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support

# Step 1: Load data from csv file
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer

df = pd.read_csv('test.csv')

# Step 2: Text processing and cleaning for Body column
# Missing bodies become empty documents so the .str accessors and the
# vectorizer never see NaN.
df['Body'] = df['Body'].fillna('').str.lower()  # convert text to lowercase
# regex=True is explicit: without it recent pandas versions treat the pattern
# as a literal string and nothing would be removed.
df['Body'] = df['Body'].str.replace(r'[^\w\s]+', '', regex=True)  # remove special characters
df['Body'] = df['Body'].str.replace(r'\d+', '', regex=True)  # remove numbers

# Step 3: Train-test split on the raw text.
# Splitting before vectorization keeps the test documents out of the
# vocabulary and IDF statistics learned below.
X = df['Body']
y = df['Footer'].values

body_train, body_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Step 4: TF-IDF vectorization using Sklearn (fitted on the training split only)
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(body_train)
X_test = vectorizer.transform(body_test)

# Step 5: Train binary classification models

# Logistic Regression
clf_lr = LogisticRegression()
clf_lr.fit(X_train, y_train)


def _to_dense(matrix):
    """Convert a scipy sparse matrix to a dense numpy array."""
    return matrix.toarray()


# Gaussian Naive Bayes
# GaussianNB does not accept sparse input, so the densifying step is bundled
# with the model: clf_gnb.predict / clf_gnb.score can then be called with the
# same sparse X_test as every other classifier.
# NOTE: the dense matrix is (n_documents x vocabulary_size) and may be large.
clf_gnb = make_pipeline(
    FunctionTransformer(_to_dense, accept_sparse=True),
    GaussianNB(),
)
clf_gnb.fit(X_train, y_train)

# Support Vector Machine
clf_svm = SVC()
clf_svm.fit(X_train, y_train)

# XGBoost
clf_xgb = XGBClassifier()
clf_xgb.fit(X_train, y_train)

# KNN
clf_knn = KNeighborsClassifier()
clf_knn.fit(X_train, y_train)

# Validation
# Every model gets the same report: accuracy, then precision / recall /
# F-score for the positive class. Pairing each title with its own classifier
# in one table guarantees the accuracy line is computed from the model being
# reported (the KNN section previously scored the XGBoost model).
for report_title, report_clf in (
    ("Logistic Regression", clf_lr),
    ("Gaussian Naive Bayes", clf_gnb),
    ("Support Vector Machine", clf_svm),
    ("XGBoost", clf_xgb),
    ("KNN", clf_knn),
):
    print("\n" + report_title)
    print("\n-----------------------\n")
    y_pred = report_clf.predict(X_test)
    precision, recall, fscore, support = precision_recall_fscore_support(
        y_test, y_pred, average='binary'
    )
    print("Accuracy:", report_clf.score(X_test, y_test))
    print("Precision:", precision)
    print("Recall:", recall)
    print("F-Score:", fscore)



In [None]:
def extract_features(data):
    """Add ``text_length`` and ``special_chars`` columns derived from ``Body``.

    ``text_length`` is the character count of each body and ``special_chars``
    is how many of those characters occur in ``string.punctuation``.  The
    columns are written onto ``data`` itself, which is then returned.
    """
    bodies = data['Body']

    def count_punctuation(body):
        # One unit per character of the body found in string.punctuation.
        return sum(1 for char in body if char in string.punctuation)

    data['text_length'] = bodies.apply(len)
    data['special_chars'] = bodies.apply(count_punctuation)
    return data

# Loading the dataframe
from scipy.sparse import csr_matrix, hstack

df = pd.read_csv("test.csv")

# Processing and cleaning the text
df['Body'] = df['Body'].apply(text_processing)

# Adding the additional features to the data
df = extract_features(df)

# Defining the feature and target variables.
# 'Body' has to travel through the split together with the numeric columns:
# the TF-IDF step below reads it from the train/test frames.
extra_feature_columns = ['text_length', 'special_chars']
X = df[['Body'] + extra_feature_columns]
y = df['Footer']

# Splitting the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Performing TF-IDF vectorization (vocabulary learned from the training split only)
vectorizer = TfidfVectorizer()
tfidf_train = vectorizer.fit_transform(X_train['Body'])
tfidf_test = vectorizer.transform(X_test['Body'])

# Adding the additional features to the vectors.
# The numeric columns come from the very same split frames, so each row stays
# aligned with its TF-IDF vector; the result is kept sparse in CSR format.
X_train = hstack(
    (tfidf_train, csr_matrix(X_train[extra_feature_columns].to_numpy(dtype=float))),
    format='csr',
)
X_test = hstack(
    (tfidf_test, csr_matrix(X_test[extra_feature_columns].to_numpy(dtype=float))),
    format='csr',
)

# Training the classifiers
clf_lr = LogisticRegression(random_state=0)
clf_lr.fit(X_train, y_train)

clf_svm = SVC(random_state=0)
clf_svm.fit(X_train, y_train)

clf_xgb = XGBClassifier(random_state=0)
clf_xgb.fit(X_train, y_train)

clf_knn = KNeighborsClassifier()
clf_knn.fit(X_train, y_train)

# Report accuracy, precision, recall and F-score for the logistic-regression
# and SVM models, one after the other, using the shared test split.
for heading, model in (
    ("Logistic Regression", clf_lr),
    ("\nSupport Vector Machine", clf_svm),
):
    print(heading)
    y_pred = model.predict(X_test)
    precision, recall, fscore, support = precision_recall_fscore_support(
        y_test, y_pred, average='binary'
    )
    print("Accuracy:", model.score(X_test, y_test))
    print("Precision:", precision)
    print("Recall:", recall)
    print("F-Score:", fscore)



In [None]:
# Lazily-filled holder for the (stemmer, stop-word set) pair shared by all
# text_processing calls, so neither is rebuilt for every row.
_TEXT_PROCESSING_RESOURCES = []


def _text_processing_resources():
    """Return the shared ``(stemmer, stop_words)`` pair, building it on first use."""
    if not _TEXT_PROCESSING_RESOURCES:
        # Imported here, from the modules that define them, so the helper
        # works regardless of which notebook cell ran first.
        from nltk.corpus import stopwords
        from nltk.stem.snowball import FrenchStemmer

        stop_words = frozenset(
            stopwords.words("english") + stopwords.words("french")
        )
        _TEXT_PROCESSING_RESOURCES.append((FrenchStemmer(), stop_words))
    return _TEXT_PROCESSING_RESOURCES[0]


def text_processing(text):
    """Normalise one document for vectorization.

    Steps: lowercase, drop digit runs, drop ``string.punctuation`` characters,
    remove English and French stop words, then stem the remaining words with
    the French stemmer.

    Args:
        text: The raw document. Anything that is not a ``str`` (for example
            the NaN pandas uses for an empty cell) is treated as an empty
            document.

    Returns:
        The surviving stemmed words joined by single spaces; ``""`` when
        nothing is left.
    """
    if not isinstance(text, str):
        return ""

    # Converting to lowercase
    text = text.lower()

    # Removing numbers
    text = re.sub(r'\d+', '', text)

    # Removing punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))

    words = text.split()
    if not words:
        # Nothing to filter or stem.
        return ""

    stemmer, stop_words = _text_processing_resources()

    # Stop words are filtered BEFORE stemming: the stop lists contain surface
    # forms, so a word that has already been stemmed may no longer match.
    return " ".join(stemmer.stem(word) for word in words if word not in stop_words)

# Read the raw dataset
df = pd.read_csv("test.csv")

# Run every message body through text_processing
cleaned_bodies = df['Body'].apply(text_processing)
df['Body'] = cleaned_bodies

# Features (the cleaned text, kept as a one-column frame) and target labels
X, y = df[['Body']], df['Footer']

# Hold out 30% of the rows for evaluation
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

# TF-IDF: learn the vocabulary on the training text, reuse it for the test text
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train['Body'])
X_test = vectorizer.transform(X_test['Body'])

# Fit the four classifiers on the training vectors
clf_lr = LogisticRegression(random_state=0).fit(X_train, y_train)
clf_svm = SVC(random_state=0).fit(X_train, y_train)
clf_xgb = XGBClassifier(random_state=0).fit(X_train, y_train)
clf_knn = KNeighborsClassifier().fit(X_train, y_train)

# Printing the accuracy, precision, recall, and f-score for each classifier.
# All four trained models are reported; previously the XGBoost header was
# printed with no metrics after it and the KNN model was never evaluated.
for report_heading, report_clf in (
    ("Logistic Regression", clf_lr),
    ("\nSupport Vector Machine", clf_svm),
    ("\nXGBoost", clf_xgb),
    ("\nKNN", clf_knn),
):
    print(report_heading)
    y_pred = report_clf.predict(X_test)
    precision, recall, fscore, support = precision_recall_fscore_support(
        y_test, y_pred, average='binary'
    )
    print("Accuracy:", report_clf.score(X_test, y_test))
    print("Precision:", precision)
    print("Recall:", recall)
    print("F-Score:", fscore)


In [None]:
import pandas as pd
import re
import string
import numpy as np
from nltk.stem import FrenchStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support