<a href="https://colab.research.google.com/github/2303A51965/NLP/blob/main/lab_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Task 1: Dataset Loading & Cleaning

In [None]:
import pandas as pd
import spacy
import re

# Load dataset

In [None]:
# Read the stress-level CSV into a DataFrame. The path is relative, so the file
# must be present in the kernel's current working directory.
df = pd.read_csv("StressLevelDataset.csv")

# Inspect structure

In [None]:
# Report the frame's dimensions, then its per-column summary.
# DataFrame.info() writes the summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line
# after the summary.
print("Dataset Shape:", df.shape)
df.info()

Dataset Shape: (1100, 21)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1100 entries, 0 to 1099
Data columns (total 21 columns):
 #   Column                        Non-Null Count  Dtype
---  ------                        --------------  -----
 0   anxiety_level                 1100 non-null   int64
 1   self_esteem                   1100 non-null   int64
 2   mental_health_history         1100 non-null   int64
 3   depression                    1100 non-null   int64
 4   headache                      1100 non-null   int64
 5   blood_pressure                1100 non-null   int64
 6   sleep_quality                 1100 non-null   int64
 7   breathing_problem             1100 non-null   int64
 8   noise_level                   1100 non-null   int64
 9   living_conditions             1100 non-null   int64
 10  safety                        1100 non-null   int64
 11  basic_needs                   1100 non-null   int64
 12  academic_performance          1100 non-null   int64
 13  study_l

# Display first 5 entries from the "anxiety_level" column

In [None]:
# Preview the leading rows of the anxiety_level column beneath a caption.
# A single print() call with a newline separator emits the caption and the
# Series on consecutive lines.
print(
    "\nFirst 5 entries from anxiety_level column:",
    df['anxiety_level'].head(),
    sep="\n",
)


First 5 entries from anxiety_level column:
0    14
1    15
2    12
3    16
4    16
Name: anxiety_level, dtype: int64


# Clean null values if any

In [None]:
# Drop every row whose anxiety_level is missing and rebind `df` to the result;
# later cells read `df`, so the name is reused rather than introducing a new one.
# The df.info() output recorded in this notebook shows 1100 non-null values for
# this column, so on this dataset the shape is unchanged.
df = df.dropna(subset=['anxiety_level'])
print("\nAfter cleaning null values, new shape:", df.shape)


After cleaning null values, new shape: (1100, 21)


# Task 2: POS Tagging with spaCy

# Load English model

In [None]:
# Load the spaCy pipeline named "en_core_web_sm"; the model package must already
# be installed in the environment for this call to succeed.
nlp = spacy.load("en_core_web_sm")

# Select the first 5 entries from anxiety_level and POS-tag each one

In [None]:
# Take the first five entries of the column and tag each one with spaCy.
#
# The values are cast to str because nlp() expects text, and the df.info()
# output recorded in this notebook lists anxiety_level as int64.
# NOTE(review): because the column is numeric, each "sentence" is just a number;
# point this at a real free-text column if the dataset has one.
sample_texts = df['anxiety_level'].head(5).astype(str)

# Iterate over the Series itself. Iterating over the string literal
# 'sample_texts' would walk its individual characters instead of the data.
for i, sentence in enumerate(sample_texts, 1):
    doc = nlp(sentence)

    # Header so each entry's tags can be told apart in the output
    print(f"\nSentence {i}: {sentence}")

    # Print POS tagging
    for token in doc:
        print(f"{token.text:15} --> {token.pos_}")

    # Extract specific POS
    nouns = [token.text for token in doc if token.pos_ == "NOUN"]
    verbs = [token.text for token in doc if token.pos_ == "VERB"]
    adjs = [token.text for token in doc if token.pos_ == "ADJ"]

    print("Nouns:", nouns)
    print("Verbs:", verbs)
    print("Adjectives:", adjs)

s               --> X
Nouns: []
Verbs: []
Adjectives: []
a               --> PRON
Nouns: []
Verbs: []
Adjectives: []
m               --> VERB
Nouns: []
Verbs: ['m']
Adjectives: []
p               --> X
Nouns: []
Verbs: []
Adjectives: []
l               --> NOUN
Nouns: ['l']
Verbs: []
Adjectives: []
e               --> X
Nouns: []
Verbs: []
Adjectives: []
_               --> PUNCT
Nouns: []
Verbs: []
Adjectives: []
t               --> PROPN
Nouns: []
Verbs: []
Adjectives: []
e               --> X
Nouns: []
Verbs: []
Adjectives: []
x               --> X
Nouns: []
Verbs: []
Adjectives: []
t               --> PROPN
Nouns: []
Verbs: []
Adjectives: []
s               --> X
Nouns: []
Verbs: []
Adjectives: []


# Q2: Regex Cleaning

In [None]:
# Sample strings for the regex-cleaning exercise. Between them they contain
# phone numbers, e-mail addresses, a URL, repeated punctuation and hashtags.
texts = [
    "My phone number is 1234567890 and my email is test@domain.com",  # bare 10-digit number + e-mail
    "Visit https://example.com for more info!!!",  # URL + trailing punctuation
    "HELLO!!! This is SOOOOO exciting :))",  # punctuation runs and an emoticon
    "Contact us at info@company.org or call +91 98765-43210",  # e-mail + number with country code, space and hyphen
    "Python's regex is very useful!!!  #Coding #Fun"  # apostrophe, double space, hashtags
]

In [None]:
def clean_text(text):
    """Strip phone numbers, e-mail addresses, URLs and punctuation from ``text``.

    The patterns are applied in a fixed order and every match is replaced by
    the empty string. The punctuation pass runs after the e-mail and URL
    passes because those patterns rely on characters such as ``@``, ``:`` and
    ``/`` that the punctuation pass would otherwise have removed.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string, with whitespace runs collapsed to single spaces
        and leading/trailing whitespace removed.
    """
    removal_patterns = (
        r'\+?\d[\d\s\-\(\)]{8,}\d',  # phone numbers
        r'\S+@\S+',                   # e-mail addresses
        r'http\S+|www\.\S+',          # URLs
        r'[^A-Za-z0-9\s]',            # anything that is not alphanumeric or whitespace
    )
    for pattern in removal_patterns:
        text = re.sub(pattern, '', text)

    # Collapse the gaps left behind by the removals.
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()

In [None]:
# Show each sample string next to its cleaned form, with a divider after each pair.
print("\n=== Cleaning Q2 Texts ===")
divider = "-" * 40
for t in texts:
    cleaned = clean_text(t)
    print("Original:", t)
    print("Cleaned :", cleaned)
    print(divider)


=== Cleaning Q2 Texts ===
Original: My phone number is 1234567890 and my email is test@domain.com
Cleaned : My phone number is and my email is
----------------------------------------
Original: Visit https://example.com for more info!!!
Cleaned : Visit for more info
----------------------------------------
Original: HELLO!!! This is SOOOOO exciting :))
Cleaned : HELLO This is SOOOOO exciting
----------------------------------------
Original: Contact us at info@company.org or call +91 98765-43210
Cleaned : Contact us at or call
----------------------------------------
Original: Python's regex is very useful!!!  #Coding #Fun
Cleaned : Pythons regex is very useful Coding Fun
----------------------------------------
