<a href="https://colab.research.google.com/github/Ayushverma41/Mental-State-Prediction-using-NLP/blob/main/Code/Data%20Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Cleaning the data, saving it, and creating the train/test split**

In [None]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split

# Read the raw combined dataset from the current working directory.
df = pd.read_csv("Combined_Data.csv")

# Assign positional names to the three columns; this raises if the file
# does not contain exactly three columns.
df = df.set_axis(["serial_no", "statements", "labels"], axis="columns")

# The serial number is not used by the rest of this script, so discard it.
df = df.drop(columns=["serial_no"])

# Text cleaning function
def clean_text(text):
    """Normalize one raw statement into lowercase, letters-only text.

    Steps, in order: lowercase, remove URLs, remove e-mail-like tokens,
    remove every character that is not ``a-z`` or whitespace, then collapse
    runs of whitespace and trim the ends.

    Args:
        text: The raw statement. Non-string values are converted with
            ``str()``. Missing values (``None``, ``NaN``, ``pd.NA``) are
            treated as empty instead of being stringified.

    Returns:
        The cleaned string, or ``""`` when the input is missing or nothing
        survives cleaning.
    """
    # str(NaN) is "nan" and str(None) is "None"; without this guard a missing
    # cell would come back as a real-looking one-word statement.
    if pd.api.types.is_scalar(text) and pd.isna(text):
        return ""

    text = str(text).lower()
    text = re.sub(r"http\S+|www\S+", "", text)        # remove URLs
    text = re.sub(r"\S+@\S+", "", text)               # remove emails
    text = re.sub(r"[^a-z\s]", "", text)              # remove numbers & special chars
    text = re.sub(r"\s+", " ", text).strip()          # remove extra spaces
    return text

# Remove rows with a missing statement or label BEFORE cleaning. Cleaning
# yields plain strings for every row, so once it has run dropna() can no
# longer detect a statement that was missing in the raw data.
df.dropna(subset=["statements", "labels"], inplace=True)

# Apply cleaning
df["statements"] = df["statements"].apply(clean_text)

# Remove statements that became empty after cleaning
df = df[df["statements"].str.len() > 0]

# Save cleaned dataset
df.to_csv("Cleaned_Combined_Data.csv", index=False)

# Train-test split (80% train, 20% test)
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,       # fixed seed so the split is reproducible
    stratify=df["labels"]  # keeps label distribution
)

# Save train and test datasets
train_df.to_csv("Train_Data.csv", index=False)
test_df.to_csv("Test_Data.csv", index=False)

print("Datasets created successfully:")
print(" - Cleaned_Combined_Data.csv")
print(" - Train_Data.csv")
print(" - Test_Data.csv")
