# 1. Problem Information
- **Name:** [**Automated spam email detection system**](https://platform.olimpiada-ai.ro/en/problems/58)
- **Date:** 12/02/2026
- **Type:** Binary Classification

# 2. Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score
import regex as re

# 3. Data preparation

In [None]:
# Load both competition splits; the paths are relative to the notebook's working directory.
test = pd.read_csv("data/ignore/test.csv")
train = pd.read_csv("data/ignore/train.csv")

# Sanity check: (rows, columns) of the training split, followed by a preview of its first rows.
print(train.shape)
train.head()

(66758, 3)


Unnamed: 0,sample_id,label,text
0,6031,0,mumia w wrote on escapenumber escapenumber esc...
1,75167,1,oh my dear sir her throat is so much better t...
2,69801,1,many more are victims of other cons telemarket...
3,78124,1,hello\nbest erection drugs\nworld wide shiping...
4,39905,0,may escapenumber escapenumber home u s world p...


In [3]:
# Summarise the training split: column dtypes, non-null counts and memory usage.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66758 entries, 0 to 66757
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   sample_id  66758 non-null  int64 
 1   label      66758 non-null  int64 
 2   text       66758 non-null  object
dtypes: int64(2), object(1)
memory usage: 1.5+ MB


# 4. Models

In [4]:
# Model input is the raw `text` column; the target is the `label` column.
X, Y = train['text'], train['label']

In [5]:
# TF-IDF features feeding a multinomial Naive Bayes classifier.
vectorizer = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    min_df=3,    # integer: ignore terms found in fewer than 3 documents
    max_df=0.8,  # float: ignore terms found in more than 80% of documents
)
pipeline = make_pipeline(vectorizer, MultinomialNB())

# 2-fold cross-validated ROC AUC on the training data; report the mean over folds.
scores = cross_val_score(pipeline, X, Y, cv=2, scoring='roc_auc')
print(scores.mean())

0.9965330161865736


In [6]:
# Refit the pipeline on the full training set (fit returns the fitted pipeline),
# then compute per-class probabilities for every test text.
predictions = pipeline.fit(X, Y).predict_proba(test['text'])

# 5. Submission

In [7]:
# task1: number of characters in each test text.
# `.str.len()` is the vectorised equivalent of applying `len` to every string;
# a missing value yields NaN instead of raising TypeError.
task1 = test['text'].str.len()

# task2: case-insensitive count of the standalone word "free" in each test text.
# NOTE(review): `re` is the third-party `regex` package here (`import regex as re`),
# while pandas documents `flags` as standard-library `re` flags -- confirm that
# `IGNORECASE` carries the same value in both modules.
task2 = test['text'].str.count(r'\bfree\b', flags=re.IGNORECASE)

np.int64(3738)

In [8]:
def _subtask_frame(subtask_id, answers):
    """Build the submission rows for a single subtask.

    Parameters
    ----------
    subtask_id : int
        Value written to every row of the `subtaskID` column.
    answers : array-like
        One answer per row of `test`, in the same row order as `test`.

    Returns
    -------
    pandas.DataFrame
        Columns `subtaskID`, `datapointID` (taken from `test['sample_id']`)
        and `answer`.
    """
    return pd.DataFrame({
        # A scalar is broadcast to every row, so this column's length always
        # follows the other columns instead of being computed separately.
        "subtaskID": subtask_id,
        "datapointID": test['sample_id'],
        "answer": answers,
    })


df_task1 = _subtask_frame(1, task1)
df_task2 = _subtask_frame(2, task2)
df_task3 = _subtask_frame(3, predictions[:, 1])  # column 1 of the predict_proba output

# Stack the three subtasks. `ignore_index=True` gives the result a unique
# 0..n-1 index rather than repeating each frame's own row labels.
# NOTE(review): mixing integer answers with float probabilities upcasts the
# whole `answer` column to float (e.g. 237.0) -- confirm the grader accepts that.
submission = pd.concat([df_task1, df_task2, df_task3], ignore_index=True)
submission.head()

Unnamed: 0,subtaskID,datapointID,answer
0,1,75707,237.0
1,1,4026,946.0
2,1,2156,1554.0
3,1,49593,1207.0
4,1,23028,1684.0


In [9]:
# Write the combined submission to `submission.csv`, omitting the DataFrame index column.
submission.to_csv("submission.csv", index=False)