<a href="https://colab.research.google.com/github/BhanuDanda/NLP/blob/main/04-08-2025.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from collections import Counter
import spacy

data = {
    "Resume": [
        "Experienced Software Engineer skilled in Python, Java, and cloud platforms.\n Passionate about AI • ML • DL.",
        "Data Analyst with 3+ years of experience. Strong skills in SQL, Excel, and visualization tools (Tableau/Power BI).",
        "Network Administrator - Expertise in Cisco routers, firewalls, VPNs & security monitoring. Good troubleshooting skills."
    ]
}

df = pd.DataFrame(data)

print("---- First 3 Resume Samples ----")
print(df.head(3))
print("\nChecking noisy characters (\\n, •, symbols):")
for text in df["Resume"]:
    print(re.findall(r"[\\n•@&/]", text))




---- First 3 Resume Samples ----
                                              Resume
0  Experienced Software Engineer skilled in Pytho...
1  Data Analyst with 3+ years of experience. Stro...
2  Network Administrator - Expertise in Cisco rou...

Checking noisy characters (\n, •, symbols):
['n', 'n', 'n', 'n', 'n', 'n', 'n', '•', '•']
['n', 'n', 'n', 'n', 'n', 'n', '/']
['n', 'n', '&', 'n', 'n', 'n']


In [9]:
import nltk
nltk.download("punkt")
nltk.download("punkt_tab")
nltk.download("stopwords")

stop_words = set(stopwords.words("english"))
ps = PorterStemmer()

def preprocess_nltk(text):

    text = re.sub(r"[^a-zA-Z\s]", "", text)

    tokens = word_tokenize(text.lower())

    tokens = [w for w in tokens if w not in stop_words]

    tokens = [ps.stem(w) for w in tokens]
    return tokens

df["NLTK_Tokens"] = df["Resume"].apply(preprocess_nltk)

all_words = [word for tokens in df["NLTK_Tokens"] for word in tokens]
freq_nltk = Counter(all_words).most_common(10)

print("\n---- Top 10 Frequent Stemmed Words (NLTK) ----")
print(freq_nltk)



---- Top 10 Frequent Stemmed Words (NLTK) ----
[('skill', 3), ('experienc', 1), ('softwar', 1), ('engin', 1), ('python', 1), ('java', 1), ('cloud', 1), ('platform', 1), ('passion', 1), ('ai', 1)]


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
nlp = spacy.load("en_core_web_sm")

def preprocess_spacy(text):
    doc = nlp(text.lower())

    tokens = [token.lemma_ for token in doc
              if token.is_alpha and token.pos_ in ["NOUN", "VERB"]]
    return tokens

df["Spacy_Tokens"] = df["Resume"].apply(preprocess_spacy)

all_lemmas = [word for tokens in df["Spacy_Tokens"] for word in tokens]
freq_spacy = Counter(all_lemmas).most_common(10)

print("\n---- Top 10 Frequent Lemmas (spaCy) ----")
print(freq_spacy)

df = pd.DataFrame(data)
print("---- First 3 Resume Samples ----")
print(df.head(3))
print("\nChecking noisy characters (\\n, •, symbols):")
for text in df["Resume"]:
    print(re.findall(r"[\\n•@&/]", text))



---- Top 10 Frequent Lemmas (spaCy) ----
[('experience', 2), ('skill', 2), ('software', 1), ('engineer', 1), ('python', 1), ('cloud', 1), ('platform', 1), ('passionate', 1), ('ai', 1), ('dl', 1)]
---- First 3 Resume Samples ----
                                              Resume
0  Experienced Software Engineer skilled in Pytho...
1  Data Analyst with 3+ years of experience. Stro...
2  Network Administrator - Expertise in Cisco rou...

Checking noisy characters (\n, •, symbols):
['n', 'n', 'n', 'n', 'n', 'n', 'n', '•', '•']
['n', 'n', 'n', 'n', 'n', 'n', '/']
['n', 'n', '&', 'n', 'n', 'n']
