In [4]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MaxAbsScaler
from sklearn.ensemble import IsolationForest

# Unsupervised anomaly detection over pre-processed messages:
# TF-IDF features -> MaxAbs scaling -> Isolation Forest -> labelled CSV export.

# Settings a reader is likely to tune, gathered in one place.
INPUT_CSV = 'lemmatized_without_stopwords_puntuation_libraries.csv'
OUTPUT_CSV = 'emails_with_anomalies.csv'
TEXT_COLUMN = 'Processed_Message'
SEED = 42                # shared by the model and the displayed sample
N_ANOMALY_SAMPLES = 5    # upper bound on anomalous rows printed below

# Read the CSV file
df = pd.read_csv(INPUT_CSV)

# TfidfVectorizer rejects NaN documents, so treat a missing message as an
# empty string. This is done on a copy: the column written back to disk is
# left exactly as it was read.
# NOTE(review): the recorded output suggests this column holds stringified
# token lists; the vectorizer's default token pattern only keeps tokens of
# two or more word characters, so tokens such as '1' or '2' are dropped -
# confirm that is intended.
documents = df[TEXT_COLUMN].fillna('').astype(str)

# Convert text to numeric features using TF-IDF vectorization.
# max_df / min_df discard terms present in more than 95% or fewer than 5% of
# the documents; ngram_range=(1, 2) keeps unigrams and bigrams.
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=0.05, ngram_range=(1, 2))  # Adjust parameters as needed
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

# Scale each feature by its maximum absolute value (keeps the matrix sparse).
scaler = MaxAbsScaler()
tfidf_matrix_scaled = scaler.fit_transform(tfidf_matrix)

# Fit the Isolation Forest model; `contamination` is the expected share of
# outliers and sets the decision threshold.
isolation_forest = IsolationForest(n_estimators=100, contamination=0.1, random_state=SEED)
isolation_forest.fit(tfidf_matrix_scaled)

# Predict anomalies: +1 marks an inlier, -1 an outlier.
anomalies = isolation_forest.predict(tfidf_matrix_scaled)

# Map the numeric predictions straight to readable labels, without a
# temporary 'Anomaly' column that would have to be dropped again.
label_mapping = {1: 'normal', -1: 'anomaly'}
df['Anomaly_Label'] = pd.Series(anomalies, index=df.index).map(label_mapping)

# Count anomalies and normal emails
anomaly_counts = df['Anomaly_Label'].value_counts()
print("Counts of anomalies and normal emails:")
print(anomaly_counts)

# Display a sample of anomalous emails. The sample size is capped at the
# number of flagged rows (a fixed n=5 raises ValueError when fewer exist) and
# the draw is seeded so the cell shows the same rows on every run.
anomalous_rows = df[df['Anomaly_Label'] == 'anomaly']
n_shown = min(N_ANOMALY_SAMPLES, len(anomalous_rows))
print("Sample of anomalous emails:")
print(anomalous_rows.sample(n=n_shown, random_state=SEED))

# Optionally, save the results to a new CSV file
df.to_csv(OUTPUT_CSV, index=False)


Counts of anomalies and normal emails:
Anomaly_Label
normal     5156
anomaly     416
Name: count, dtype: int64
Sample of anomalous emails:
      Category                                  Processed_Message  \
4340         0  ['got', 'outta', 'class', 'gon', 'na', 'go', '...   
5029         0  ['go', 'chase', 'run', 'shes', 'crossing', 'st...   
1373         0  ['1', 'go', 'write', 'msg', '2', 'put', 'dicti...   
2377         0  ['im', 'way', 'home', 'went', 'change', 'batt'...   
2230         0  ['haha', 'money', 'leh', 'later', 'got', 'go',...   

     Anomaly_Label  
4340       anomaly  
5029       anomaly  
1373       anomaly  
2377       anomaly  
2230       anomaly  
