In [None]:
import re
import string
from functools import lru_cache

import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.snowball import FrenchStemmer
from scipy.sparse import csr_matrix, hstack
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

# Step 1: Load data from csv file
df = pd.read_csv('test.csv')

# Step 2: Text processing and cleaning for Body column
# regex=True is required: since pandas 2.0, Series.str.replace treats the
# pattern as a literal string by default, which would leave the text untouched.
df['Body'] = (
    df['Body']
    .str.lower()  # convert text to lowercase
    .str.replace(r'[^\w\s]+', '', regex=True)  # remove special characters
    .str.replace(r'\d+', '', regex=True)  # remove numbers
)

# Step 3: Train-test split on the cleaned text
X = df['Body']
y = df['Footer'].values

text_train, text_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Step 4: TF-IDF vectorization using Sklearn
# The vocabulary and IDF weights are learned from the training rows only, so
# no information from the held-out rows leaks into the features.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

# GaussianNB rejects scipy sparse input, so it gets dense copies.
# (This materialises the full matrix; fine for small corpora only.)
X_train_dense = X_train.toarray()
X_test_dense = X_test.toarray()

# Step 5: Train binary classification models

# Logistic Regression
clf_lr = LogisticRegression()
clf_lr.fit(X_train, y_train)

# Gaussian Naive Bayes
clf_gnb = GaussianNB()
clf_gnb.fit(X_train_dense, y_train)

# Support Vector Machine
clf_svm = SVC()
clf_svm.fit(X_train, y_train)

# XGBoost
clf_xgb = XGBClassifier()
clf_xgb.fit(X_train, y_train)

# KNN
clf_knn = KNeighborsClassifier()
clf_knn.fit(X_train, y_train)

# Validation
# Each entry pairs a classifier with the test matrix it can consume, so every
# model is scored with its own estimator (the KNN accuracy used to be taken
# from the XGBoost model).
evaluations = [
    ("Logistic Regression", clf_lr, X_test),
    ("Gaussian Naive Bayes", clf_gnb, X_test_dense),
    ("Support Vector Machine", clf_svm, X_test),
    ("XGBoost", clf_xgb, X_test),
    ("KNN", clf_knn, X_test),
]

for model_name, clf, X_eval in evaluations:
    print("\n" + model_name)
    print("\n-----------------------\n")
    y_pred = clf.predict(X_eval)
    precision, recall, fscore, support = precision_recall_fscore_support(y_test, y_pred, average='binary')
    print("Accuracy:", clf.score(X_eval, y_test))
    print("Precision:", precision)
    print("Recall:", recall)
    print("F-Score:", fscore)



In [None]:
def extract_features(data):
    """Add two hand-crafted text features to ``data`` and return it.

    The frame is modified in place: ``text_length`` receives the character
    count of each ``Body`` value and ``special_chars`` the number of its
    characters that appear in ``string.punctuation``.

    Args:
        data: DataFrame with a ``Body`` column of strings.

    Returns:
        The same DataFrame object, now carrying the two extra columns.
    """
    def count_punctuation(body):
        # One hit per punctuation character found in the body.
        return sum(1 for ch in body if ch in string.punctuation)

    bodies = data['Body']
    data['text_length'] = bodies.map(len)
    data['special_chars'] = bodies.map(count_punctuation)
    return data

# Loading the dataframe
df = pd.read_csv("test.csv")

# Adding the additional features to the data.
# This runs on the raw text: text_processing strips punctuation, so counting
# special characters after cleaning would always give 0.
df = extract_features(df)

# Processing and cleaning the text
# NOTE: text_processing is defined in a later cell of this notebook; that cell
# must have been executed before this one.
df['Body'] = df['Body'].apply(text_processing)

# Defining the feature and target variables
# 'Body' has to travel through the split together with the numeric features so
# that the TF-IDF rows and the extra columns stay aligned.
EXTRA_FEATURES = ['text_length', 'special_chars']
X = df[['Body'] + EXTRA_FEATURES]
y = df['Footer']

# Splitting the data into training and test sets
df_train, df_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Performing TF-IDF vectorization (fitted on the training rows only)
vectorizer = TfidfVectorizer()
tfidf_train = vectorizer.fit_transform(df_train['Body'])
tfidf_test = vectorizer.transform(df_test['Body'])

# Adding the additional features to the vectors
# The numeric columns are wrapped as sparse blocks and the result is requested
# in CSR format, which supports the row slicing the estimators rely on.
X_train = hstack(
    [tfidf_train, csr_matrix(df_train[EXTRA_FEATURES].to_numpy(dtype=float))],
    format='csr',
)
X_test = hstack(
    [tfidf_test, csr_matrix(df_test[EXTRA_FEATURES].to_numpy(dtype=float))],
    format='csr',
)

# Training the classifiers
clf_lr = LogisticRegression(random_state=0)
clf_lr.fit(X_train, y_train)

clf_svm = SVC(random_state=0)
clf_svm.fit(X_train, y_train)

clf_xgb = XGBClassifier(random_state=0)
clf_xgb.fit(X_train, y_train)

clf_knn = KNeighborsClassifier()
clf_knn.fit(X_train, y_train)

# Printing the accuracy, precision, recall, and f-score for each classifier
# (all four trained models are reported; headings keep their original text).
reports = [
    ("Logistic Regression", clf_lr),
    ("\nSupport Vector Machine", clf_svm),
    ("\nXGBoost", clf_xgb),
    ("\nKNN", clf_knn),
]

for heading, clf in reports:
    print(heading)
    y_pred = clf.predict(X_test)
    precision, recall, fscore, support = precision_recall_fscore_support(y_test, y_pred, average='binary')
    print("Accuracy:", clf.score(X_test, y_test))
    print("Precision:", precision)
    print("Recall:", recall)
    print("F-Score:", fscore)



In [None]:
@lru_cache(maxsize=1)
def _french_stemmer():
    """Build the French stemmer once so it is not re-created for every row."""
    return FrenchStemmer()


@lru_cache(maxsize=1)
def _bilingual_stop_words():
    """Load the combined English + French stop-word set once."""
    return frozenset(stopwords.words("english") + stopwords.words("french"))


def text_processing(text):
    """Normalise one document for TF-IDF vectorisation.

    Steps, in order: lowercase, drop digits, drop ASCII punctuation, remove
    English/French stop words, then stem the remaining words with the French
    stemmer.

    Stop words are removed *before* stemming because the stop-word lists hold
    surface forms; a word that has already been stemmed may no longer match
    its list entry and would slip through.

    Args:
        text: The raw document. Non-string values (e.g. the NaN that pandas
            yields for an empty CSV field) are treated as an empty document.

    Returns:
        The cleaned document as a single space-separated string.
    """
    if not isinstance(text, str):
        return ""

    # Converting to lowercase
    text = text.lower()

    # Removing numbers
    text = re.sub(r'\d+', '', text)

    # Removing punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))

    # Removing stop words, then stemming what is left
    stop_words = _bilingual_stop_words()
    stemmer = _french_stemmer()
    return " ".join(
        stemmer.stem(word) for word in text.split() if word not in stop_words
    )

# Loading the dataframe
df = pd.read_csv("test.csv")

# Processing and cleaning the text
df['Body'] = df['Body'].apply(text_processing)

# Defining the feature and target variables
X = df[['Body']]
y = df['Footer']

# Splitting the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Performing TF-IDF vectorization (fitted on the training rows only)
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train['Body'])
X_test = vectorizer.transform(X_test['Body'])

# Training the classifiers
clf_lr = LogisticRegression(random_state=0)
clf_lr.fit(X_train, y_train)

clf_svm = SVC(random_state=0)
clf_svm.fit(X_train, y_train)

clf_xgb = XGBClassifier(random_state=0)
clf_xgb.fit(X_train, y_train)

clf_knn = KNeighborsClassifier()
clf_knn.fit(X_train, y_train)

# Printing the accuracy, precision, recall, and f-score for each classifier.
# The report used to stop right after the "XGBoost" heading; every trained
# model now gets the same four metrics.
reports = [
    ("Logistic Regression", clf_lr),
    ("\nSupport Vector Machine", clf_svm),
    ("\nXGBoost", clf_xgb),
    ("\nKNN", clf_knn),
]

for heading, clf in reports:
    print(heading)
    y_pred = clf.predict(X_test)
    precision, recall, fscore, support = precision_recall_fscore_support(y_test, y_pred, average='binary')
    print("Accuracy:", clf.score(X_test, y_test))
    print("Precision:", precision)
    print("Recall:", recall)
    print("F-Score:", fscore)


In [None]:
import pandas as pd
import re
import string
import numpy as np
# FrenchStemmer is defined in nltk.stem.snowball; importing it from there
# does not depend on what the nltk.stem package re-exports.
from nltk.stem.snowball import FrenchStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support

In [None]:
import plotly.express as px
import pandas as pd

# Load the data
df = pd.read_csv('test.csv')

# Count the number of instances of each class
class_counts = df['Footer'].value_counts().to_dict()

# Name each slice from its own class key. value_counts() orders classes by
# frequency, so a fixed ['NO Footer', 'Footer'] list would mislabel the slices
# whenever class 1 is the more common one.
slice_names = ['NO Footer' if key == 0 else 'Footer' for key in class_counts]

# Create the pie chart
fig = px.pie(values=list(class_counts.values()),
             names=slice_names,
             title='Proportion of Footer vs NO Footer')

# Show the chart
fig.show()

In [None]:
import plotly.express as px
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk import ngrams
from wordcloud import WordCloud
import matplotlib.pyplot as plt


# Number of most frequent items shown in each bar chart; plotting the whole
# vocabulary yields an unreadable, unsorted chart.
TOP_N = 20

# Load the data
df = pd.read_csv('test.csv')

# Filter the rows with Footer value == 1
footer_rows = df[df['Footer'] == 1]

# Compile the text of the Body column in one corpus.
# Missing bodies are dropped first: str.join raises TypeError on NaN.
corpus = ' '.join(footer_rows['Body'].dropna().astype(str))

# Perform basic text processing on the corpus
corpus = re.sub(r'[^\w\s]', '', corpus)
corpus = corpus.lower()

# Tokenize the corpus into words
tokens = word_tokenize(corpus)

# Compute word frequency
word_freq = FreqDist(tokens)

# Plot the word frequency (most frequent first)
top_words = word_freq.most_common(TOP_N)
fig = px.bar(x=[word for word, _ in top_words],
             y=[count for _, count in top_words],
             title='Word Frequency')
fig.show()

# Plot the word cloud using the word frequency
wordcloud = WordCloud(width=800, height=400).generate_from_frequencies(word_freq)
plt.imshow(wordcloud)
plt.axis("off")
plt.show()

# Compute 2-gram frequency
two_grams = ngrams(tokens, 2)
two_gram_freq = FreqDist(two_grams)

# FreqDist keys are (word, word) tuples; plotly categories and WordCloud both
# need plain strings, so every 2-gram is rendered as "word word".
two_gram_labels = {' '.join(gram): count for gram, count in two_gram_freq.items()}

# Plot the 2-gram frequency (most frequent first)
top_two_grams = two_gram_freq.most_common(TOP_N)
fig = px.bar(x=[' '.join(gram) for gram, _ in top_two_grams],
             y=[count for _, count in top_two_grams],
             title='2-Gram Frequency')
fig.show()


# Plot the word cloud using the 2-gram frequency
wordcloud = WordCloud(width=800, height=400).generate_from_frequencies(two_gram_labels)
plt.imshow(wordcloud)
plt.axis("off")
plt.show()

In [None]:
import re

def clean_footer(text, footer_markers):
    """Split ``text`` into its main part and its footer.

    The markers are tried in the order given; the first marker that occurs
    anywhere in ``text`` decides where the footer starts.

    Args:
        text: The full message text.
        footer_markers: Iterable of substrings that introduce a footer.

    Returns:
        A ``(main_text, footer)`` tuple. ``footer`` begins at the matching
        marker and runs to the end of ``text``. When no marker is found the
        result is ``(text, "")``, so callers can always unpack two values.
    """
    # loop through all the markers to find the start of the footer
    for marker in footer_markers:
        footer_start = text.find(marker)
        if footer_start != -1:
            # split the text into main text and footer
            return text[:footer_start], text[footer_start:]

    # no marker present: the whole text is main text and the footer is empty
    return text, ""

# Demo input: a short body followed by a footer introduced by "Footer:".
email_text = (
    "This is an email text...\n\n"
    "Footer:\n"
    "Contact Us: email@example.com\n"
    "Copyright 2021..."
)
footer_markers = [
    "Footer:",
    "Copyright",
]
# Split the demo e-mail into its body and its footer.
main_text, cleaned_footer = clean_footer(text=email_text, footer_markers=footer_markers)

In [None]:
import nltk
from nltk import FreqDist
import plotly.express as px

n = 20  # how many of the most frequent words / n-grams are requested and plotted below

def get_word_freq(text, n):
    """Return the ``n`` most common word tokens of ``text``.

    Args:
        text: The text to analyse; it is tokenized with ``nltk.word_tokenize``.
        n: How many of the most frequent tokens to return.

    Returns:
        A list of ``(token, count)`` pairs as produced by
        ``FreqDist.most_common``.
    """
    token_counts = FreqDist(nltk.word_tokenize(text))
    return token_counts.most_common(n)

def get_ngram_freq(text, n, ngram_range=(2, 2)):
    """Return the ``n`` most common n-grams of ``text``.

    Args:
        text: The text to analyse; it is tokenized with ``nltk.word_tokenize``.
        n: How many of the most frequent n-grams to return.
        ngram_range: ``(min_size, max_size)`` of the n-grams to count, both
            ends inclusive. The default ``(2, 2)`` counts 2-grams only.

    Returns:
        A list of ``(ngram_tuple, count)`` pairs as produced by
        ``FreqDist.most_common``.

    Raises:
        ValueError: If ``ngram_range`` is not ``1 <= min_size <= max_size``.
    """
    min_size, max_size = ngram_range
    if min_size < 1 or min_size > max_size:
        raise ValueError(
            f"ngram_range must satisfy 1 <= min <= max, got {ngram_range!r}"
        )

    # tokenize the text into words
    words = nltk.word_tokenize(text)

    # count the n-grams of every size in the range, not just the upper bound
    fdist = FreqDist()
    for size in range(min_size, max_size + 1):
        fdist.update(nltk.ngrams(words, size))

    return fdist.most_common(n)

# get word frequency
word_freq = get_word_freq(cleaned_footer, n)
# plot the word frequency
fig = px.bar(x=[word for word, _ in word_freq],
             y=[count for _, count in word_freq],
             title="Word Frequency")
fig.show()

# get 2-gram frequency
ngram_freq = get_ngram_freq(cleaned_footer, n)
# plot the 2-gram frequency
# Each n-gram is a tuple of tokens; it is joined into one "word word" string
# because plotly needs scalar category labels on the x axis.
fig = px.bar(x=[" ".join(gram) for gram, _ in ngram_freq],
             y=[count for _, count in ngram_freq],
             title="2-gram Frequency")
fig.show()


In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Per-language stop-word sets plus their union, which remove_stop_words
# uses for filtering.
stop_words_french = {*stopwords.words('french')}
stop_words_english = {*stopwords.words('english')}
stop_words = stop_words_french | stop_words_english

def remove_stop_words(text):
    """Drop stop words from ``text`` and return the remaining tokens.

    The text is tokenized with ``word_tokenize``; a token is discarded when
    its lowercase form is in the module-level ``stop_words`` set. Surviving
    tokens keep their original casing and are joined with single spaces.
    """
    kept_tokens = []
    for token in word_tokenize(text):
        if token.lower() in stop_words:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

# Example usage
# The demo sentence gets its own name so that it does not overwrite the
# `cleaned_footer` produced by clean_footer earlier in the notebook.
example_footer = "the text of the footer after cleaning"
filtered_footer = remove_stop_words(example_footer)