Importing Libraries

In [86]:
import numpy as np
import pandas as pd


Load the Dataset

In [87]:
# Load the combined dataset. The path uses forward slashes: a backslash path
# such as "..\Data\..." relies on invalid escape sequences ("\D", "\c"), which
# emit a SyntaxWarning on recent Python versions, and it only resolves on
# Windows. Forward slashes work on Windows, macOS and Linux alike.
df = pd.read_csv("../Data/combined_data.csv")
df.head()

Unnamed: 0,label,text
0,1,ounce feather bowl hummingbird opec moment ala...
1,1,wulvob get your medircations online qnb ikud v...
2,0,computer connection from cnn com wednesday es...
3,1,university degree obtain a prosperous future m...
4,0,thanks for all your answers guys i know i shou...


In [88]:
# Dataset info: index range, column names, non-null counts, dtypes and
# memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83448 entries, 0 to 83447
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   83448 non-null  int64 
 1   text    83448 non-null  object
dtypes: int64(1), object(1)
memory usage: 1.3+ MB


Shape of the Dataset

In [89]:
# df.shape is a (rows, columns) pair; unpack it once and report each part.
n_rows, n_cols = df.shape
print("Number of rows are: ", n_rows)
print("Number of columns are: ", n_cols)

Number of rows are:  83448
Number of columns are:  2


In [90]:
# Summary statistics (count, mean, std, quartiles, min/max). With default
# arguments only the numeric column `label` is summarised; the object column
# `text` is left out.
df.describe()

Unnamed: 0,label
count,83448.0
mean,0.526196
std,0.499316
min,0.0
25%,0.0
50%,1.0
75%,1.0
max,1.0


In [91]:
# Per-column profile: dtype, number of distinct values, number of missing
# values, and the ten most frequent values.
print("\n=== VALUE COUNTS FOR EACH COLUMN ===")
for col_name in df.columns:
    series = df[col_name]  # look the column up once and reuse it below
    print(f"\n--- {col_name} ---")
    print(f"Data type: {series.dtype}")
    print(f"Unique values: {series.nunique()}")
    print(f"Missing values: {series.isna().sum()}")
    print("Value counts:")
    print(series.value_counts().head(10))


=== VALUE COUNTS FOR EACH COLUMN ===

--- label ---
Data type: int64
Unique values: 2
Missing values: 0
Value counts:
label
1    43910
0    39538
Name: count, dtype: int64

--- text ---
Data type: object
Unique values: 83446
Missing values: 0
Value counts:
text
unsubscribe                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      

Duplicated values

In [92]:
# Count rows that are exact duplicates of an earlier row. duplicated()
# compares all columns together, so a message text that appears more than
# once is only counted here when its label is repeated as well.
# NOTE(review): `text` has fewer unique values than rows while this reports 0;
# presumably the repeated texts carry different labels - confirm.
dup = df.duplicated().sum()
print(f'number of duplicated rows are {dup}')

number of duplicated rows are 0


Missing values

In [93]:
# Number of missing (null) entries in each column.
df.isna().sum()

label    0
text     0
dtype: int64

In [94]:
# Show the column labels of the DataFrame.
df.columns

Index(['label', 'text'], dtype='object')

Describe the Dataset

In [95]:
# Summary statistics for every column, numeric and object alike; rounded to
# two decimals for display only.
full_summary = df.describe(include='all')
full_summary.round(2)

Unnamed: 0,label,text
count,83448.0,83448
unique,,83446
top,,unsubscribe
freq,,2
mean,0.53,
std,0.5,
min,0.0,
25%,0.0,
50%,1.0,
75%,1.0,


In [96]:
# Report how many distinct (non-null) values each column holds.
for col_name, n_unique in df.nunique().items():
    print("No. of unique values in", col_name, "is", n_unique)

No. of unique values in label is 2
No. of unique values in text is 83446


Cleaning the Text

In [98]:
import re
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

def simple_clean(text):
    """Normalise one raw message into lowercase, space-separated letters.

    Missing values (anything ``pd.isna`` reports as null) become an empty
    string. Any other value is converted with ``str()``, every character
    that is neither an ASCII letter nor whitespace is replaced by a space,
    the result is lowercased, and runs of whitespace collapse to single
    spaces with no leading or trailing space.

    Parameters
    ----------
    text : object
        A scalar value, typically a string; may be missing.

    Returns
    -------
    str
        The cleaned text, possibly empty.
    """
    if pd.isna(text):
        return ""

    # Digits and punctuation become spaces so adjacent words stay separated.
    letters_only = re.sub(r'[^a-zA-Z\s]', ' ', str(text))
    # split() with no argument drops empty pieces, which collapses whitespace.
    tokens = letters_only.lower().split()
    return ' '.join(tokens)

# Stage 1: normalise every raw message (letters only, lowercase, single spaces).
df['cleaned_text'] = df['text'].apply(simple_clean)

# NOTE(review): this vectorizer is configured but never fitted or applied in
# the cells visible here - confirm whether a later step relies on it.
vectorizer = CountVectorizer(
    stop_words='english',
    lowercase=True,
    token_pattern=r'(?u)\b[a-zA-Z]{3,}\b',
)

# Stage 2: discard tokens shorter than three characters and rebuild each
# document as a single space-joined string.
cleaned_docs = [
    ' '.join(token for token in doc.split() if len(token) >= 3)
    for doc in df['cleaned_text']
]

df['cleaned_text'] = cleaned_docs

In [99]:
# Drop the raw `text` column. errors='ignore' keeps this cell safe to re-run:
# without it, a second execution raises KeyError because 'text' is already
# gone from the reassigned `df`.
df = df.drop(columns=['text'], errors='ignore')

In [100]:
# Save to CSV file.
# index=False leaves the row index out of the file, so only the DataFrame's
# columns are written. The file name is relative to the current working
# directory.
df.to_csv('cleaned_dataset.csv', index=False)
print("✅ Dataset saved as 'cleaned_dataset.csv'")

✅ Dataset saved as 'cleaned_dataset.csv'
