In [2]:
import pandas as pd
import os

# List all CSV files in the data/ directory
data_dir = 'data/'
csv_files = [f for f in os.listdir(data_dir) if f.endswith('.csv')]

print("CSV files found:")
for file in csv_files:
    print(f"  - {file}")

print("\n" + "="*80 + "\n")

# Read and print first few rows of each CSV file
for csv_file in csv_files:
    file_path = os.path.join(data_dir, csv_file)
    print(f"File: {csv_file}")
    print(f"Path: {file_path}")
    try:
        df = pd.read_csv(file_path)
        print(f"Shape: {df.shape} (rows, columns)")
        print(f"Columns: {list(df.columns)}")
        print("\nFirst 5 rows:")
        print(df.head())
    except Exception as e:
        print(f"Error reading file: {e}")
    print("\n" + "-"*80 + "\n")

CSV files found:
  - enron_spam_data.csv
  - spam_assassin.csv


File: enron_spam_data.csv
Path: data/enron_spam_data.csv
Shape: (33716, 5) (rows, columns)
Columns: ['SNo', 'Subject', 'Message', 'Spam/Ham', 'Date']

First 5 rows:
   SNo                       Subject  \
0    0  christmas tree farm pictures   
1    1      vastar resources , inc .   
2    2  calpine daily gas nomination   
3    3                    re : issue   
4    4     meter 7268 nov allocation   

                                             Message Spam/Ham        Date  
0                                                NaN      ham  1999-12-10  
1  gary , production from the high island larger ...      ham  1999-12-13  
2             - calpine daily gas nomination 1 . doc      ham  1999-12-14  
3  fyi - see note below - already done .\nstella\...      ham  1999-12-14  
4  fyi .\n- - - - - - - - - - - - - - - - - - - -...      ham  1999-12-14  

------------------------------------------------------------------------

In [3]:
enron = pd.read_csv('data/enron_spam_data.csv')
enron['Spam/Ham'].value_counts()

Spam/Ham
spam    17171
ham     16545
Name: count, dtype: int64

In [6]:
sp_assassin = pd.read_csv('data/spam_assassin.csv')
sp_assassin['target'].value_counts()

target
0    3900
1    1896
Name: count, dtype: int64

In [7]:
enron['text'] = enron['Subject'] + " " + enron['Message']
enron['target'] = enron['Spam/Ham'].map({'ham': 0, 'spam': 1})

In [8]:
enron_clean = enron[['text', 'target']].copy()
sp_assassin_clean = sp_assassin[['text', 'target']].copy()

In [15]:
enron_clean['original_dataset'] = 'enron'
sp_assassin_clean['original_dataset'] = 'spam_assassin'

In [16]:
enron_clean.head()

Unnamed: 0,text,target,original_dataset
0,,0,enron
1,"vastar resources , inc . gary , production fro...",0,enron
2,calpine daily gas nomination - calpine daily g...,0,enron
3,re : issue fyi - see note below - already done...,0,enron
4,meter 7268 nov allocation fyi .\n- - - - - - -...,0,enron


In [17]:
sp_assassin_clean.head()

Unnamed: 0,text,target,original_dataset
0,From ilug-admin@linux.ie Mon Jul 29 11:28:02 2...,0,spam_assassin
1,From gort44@excite.com Mon Jun 24 17:54:21 200...,1,spam_assassin
2,From fork-admin@xent.com Mon Jul 29 11:39:57 2...,1,spam_assassin
3,From dcm123@btamail.net.cn Mon Jun 24 17:49:23...,1,spam_assassin
4,From ilug-admin@linux.ie Mon Aug 19 11:02:47 2...,0,spam_assassin


In [20]:
final_df = pd.concat([enron_clean, sp_assassin_clean], axis=0, ignore_index=True)
final_df['stratify_key'] = final_df['target'].astype(str) + "_" + final_df['original_dataset'].astype(str)

final_df.head()

Unnamed: 0,text,target,original_dataset,stratify_key
0,,0,enron,0_enron
1,"vastar resources , inc . gary , production fro...",0,enron,0_enron
2,calpine daily gas nomination - calpine daily g...,0,enron,0_enron
3,re : issue fyi - see note below - already done...,0,enron,0_enron
4,meter 7268 nov allocation fyi .\n- - - - - - -...,0,enron,0_enron


In [22]:
final_df['stratify_key'].value_counts()

stratify_key
1_enron            17171
0_enron            16545
0_spam_assassin     3900
1_spam_assassin     1896
Name: count, dtype: int64

In [24]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    final_df[['text', 'original_dataset']],  # Keeping origin in X to verify later if needed
    final_df['target'], 
    test_size=0.2, 
    stratify=final_df['stratify_key'], # <--- The Magic Step
    random_state=42
)

In [25]:
test_check = X_test.copy()
test_check['target'] = y_test

print("\n--- Proportions in Original Data ---")
print(final_df.groupby(['original_dataset', 'target']).size() / len(final_df))

print("\n--- Proportions in Test Set (Should match Original closely) ---")
print(test_check.groupby(['original_dataset', 'target']).size() / len(test_check))


--- Proportions in Original Data ---
original_dataset  target
enron             0         0.418734
                  1         0.434577
spam_assassin     0         0.098704
                  1         0.047985
dtype: float64

--- Proportions in Test Set (Should match Original closely) ---
original_dataset  target
enron             0         0.418702
                  1         0.434645
spam_assassin     0         0.098697
                  1         0.047956
dtype: float64


In [10]:
final_df = pd.concat([enron_clean, sp_assassin_clean], axis=0, ignore_index=True)

final_df.shape

(39512, 2)

In [13]:
print(f"Shape: {final_df.shape} (rows, columns)")
print(f"Columns: {list(final_df.columns)}")
print("\nFirst 5 rows:")
print(final_df.head())

Shape: (39512, 2) (rows, columns)
Columns: ['text', 'target']

First 5 rows:
                                                text  target
0                                                NaN       0
1  vastar resources , inc . gary , production fro...       0
2  calpine daily gas nomination - calpine daily g...       0
3  re : issue fyi - see note below - already done...       0
4  meter 7268 nov allocation fyi .\n- - - - - - -...       0


In [27]:
pd.read_csv('data/train_data.csv').head()

Unnamed: 0,text,original_dataset,label
0,"fw : memo : re : your work phone number hi ,\n...",enron,1
1,transwestern capacity release report - 06 / 20...,enron,0
2,From pudge@perl.org Thu Oct 3 12:22:42 2002 Re...,spam_assassin,0
3,start date : 2 / 6 / 02 ; hourahead hour : 24 ...,enron,1
4,"fw : memo : re : your work phone number hi ,\n...",enron,1
