# Import Required Libraries
Import the libraries used throughout the notebook: pandas, numpy, matplotlib, seaborn, wordcloud, NLTK, scikit-learn, and the standard-library `re` module.

In [None]:
import re
import nltk
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download("stopwords")

# Load Data
Load the dataset from a CSV file into a pandas DataFrame.

In [None]:
# Path to the resume dataset, relative to the notebook's working directory
file_path = 'data/Resume.csv'
# Read the whole CSV into a DataFrame; later cells use its 'Resume_str',
# 'Resume_html' and 'Category' columns.
df = pd.read_csv(file_path)

# Display the first few rows of the dataframe
df.head()

# Data Overview
Display the first few rows of the dataset and get basic information such as shape, columns, and data types.

In [None]:
# drop unused columns
#del df['ID']
# del df['Resume_html']
# df.head()

In [None]:
# List the column names available in the DataFrame
print(df.columns)

In [None]:
# Print a concise summary of the DataFrame (columns, non-null counts, dtypes)
df.info()

In [None]:
# Display summary statistics; called without arguments, so for a frame that
# mixes text and numeric columns only the numeric ones are summarised
df.describe()

In [None]:
# Character counts of the plain-text and HTML versions of each resume.
# `.str.len()` is used rather than `.apply(len)` so that a missing value yields
# NaN instead of raising TypeError (a float NaN has no len()); this cell runs
# before the missing-value check further down.
df['Resume_str_length'] = df['Resume_str'].str.len()
df['Resume_html_length'] = df['Resume_html'].str.len()

# Number of resumes per job category, most frequent first
category_distribution = df['Category'].value_counts()

# Descriptive statistics of the two length columns
resume_str_summary = df['Resume_str_length'].describe()
resume_html_summary = df['Resume_html_length'].describe()

# Trailing expression: the notebook renders this tuple as the cell output
category_distribution, resume_str_summary, resume_html_summary

# Missing Values Analysis
Check for missing values in the dataset and handle them appropriately.

In [None]:
# Count the missing (NaN/None) entries in every column
missing_values = df.isna().sum()
print("Missing Values:\n", missing_values)

In [None]:
# Show the dtype of each column
print("Data Types:\n", df.dtypes)

# Text Data Preprocessing
Preprocess the text data by removing punctuation and stop words and by tokenizing the text.

In [None]:
# Ensure the NLTK resources needed by the text cleaning step are downloaded.
# - 'punkt' / 'punkt_tab': tokenizer data behind word_tokenize. Recent NLTK
#   releases (3.9+) look up 'punkt_tab' instead of 'punkt', so both are fetched
#   to keep the notebook working on old and new installations.
# - 'stopwords': the word lists read by stopwords.words('english').
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(nltk_resource)

In [None]:
# Text cleaning
# Strips markup, URLs, hashtags, mentions, punctuation and stop words from a
# resume. No stemming or lemmatization is applied.

# Patterns are compiled once at definition time because clean_text runs on
# every row. Raw strings avoid invalid-escape warnings for \S and \s.
_HTML_TAG_RE = re.compile(r'<.*?>')
_URL_RE = re.compile(r'http\S+\s*')
_HASHTAG_RE = re.compile(r'#\S+')
_MENTION_RE = re.compile(r'@\S+')
# Word boundaries keep "cc"/"RT" from being cut out of the middle of words
# such as "account" or "success".
_RT_CC_RE = re.compile(r'\b(?:RT|cc)\b')
_NON_ALNUM_RE = re.compile(r'[^a-zA-Z0-9\s]')
_WHITESPACE_RE = re.compile(r'\s+')

# Filled on first use so the stop-word corpus is read from disk only once
# instead of once per resume.
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them once."""
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE['english']


def clean_text(text):
    """Return *text* reduced to space-separated tokens without stop words.

    Steps, in order: drop HTML tags, URLs, hashtags, @-mentions and the
    stand-alone tokens "RT"/"cc"; replace every remaining character that is
    not an ASCII letter, digit or whitespace with a space; collapse runs of
    whitespace; tokenize with NLTK's ``word_tokenize``; drop English stop
    words (compared case-insensitively, original casing is kept in the output).

    Args:
        text: The raw resume text. Any non-string value (for example the
            float NaN pandas uses for a missing cell) is treated as empty.

    Returns:
        The cleaned text as a single string; ``''`` when nothing remains.
    """
    if not isinstance(text, str):
        return ''

    # URLs, hashtags and mentions must go BEFORE punctuation is stripped:
    # once ':', '/', '#' and '@' are replaced by spaces these patterns can no
    # longer recognise what they are meant to remove.
    text = _HTML_TAG_RE.sub(' ', text)
    text = _URL_RE.sub(' ', text)
    text = _HASHTAG_RE.sub(' ', text)
    text = _MENTION_RE.sub(' ', text)
    text = _RT_CC_RE.sub(' ', text)
    text = _NON_ALNUM_RE.sub(' ', text)
    text = _WHITESPACE_RE.sub(' ', text).strip()

    tokens = word_tokenize(text)
    stop_words = _english_stop_words()
    # NLTK's stop-word list is lower-case, so compare on the lower-cased token
    # to also drop capitalised forms such as "The" or "And".
    return ' '.join(tok for tok in tokens if tok.lower() not in stop_words)
    

# # preprocessing text
# df['Resume'] = df['Resume'].apply(lambda w: preprocess(w))

# Build the cleaned-text column by running clean_text over every raw resume
raw_resumes = df['Resume_str']
df['Cleaned_Resume_str'] = raw_resumes.map(clean_text)

# Show raw and cleaned text side by side for the first few rows
df[['Resume_str', 'Cleaned_Resume_str']].head()



In [None]:
# 4. Categorical Encoding

# Learn the label -> integer mapping first, then write the codes to a new
# column so the text labels in 'Category' are left untouched by this cell.
label_encoder = LabelEncoder()
label_encoder.fit(df['Category'])
df['Category_Encoded'] = label_encoder.transform(df['Category'])

# Preview each label next to its integer code
df[['Category', 'Category_Encoded']].head()


In [None]:
# Distinct category values, sorted in ascending order
unique_categories = df['Category'].unique()
categories = np.sort(unique_categories)
categories

In [None]:
# One DataFrame per category (a list, in the same order as `categories`),
# each holding only the resume text and its category value
corpus_columns = ['Resume_str', 'Category']
df_categories = [
    df.loc[df['Category'] == category, corpus_columns]
    for category in categories
]
df_categories

In [None]:
# Check for duplicate data based on the 'Cleaned_Resume_str' column.
# duplicated() leaves the first occurrence unmarked, so `duplicates` holds
# only the repeated rows.
duplicates = df[df.duplicated(subset=['Cleaned_Resume_str'])]

# Report how many repeats exist and show a few of them; the message names the
# column that was actually compared.
duplicates_count = len(duplicates)
if duplicates_count > 0:
    print(f"There are {duplicates_count} duplicate rows based on the 'Cleaned_Resume_str' column.")
    print(duplicates.head())  # Display a few rows with duplicates
else:
    print("No duplicates found.")

In [None]:
# resume_data = df.drop_duplicates(subset=['Cleaned_Resume_str'])

# Exploratory Data Analysis
Perform EDA on the text data.

In [None]:

# Columns to label-encode in place
var_mod = ['Category']
le = LabelEncoder()
for i in var_mod:
    # Replaces the column itself with integer codes, so the text labels are no
    # longer available in df['Category'] after this cell runs.
    df[i] = le.fit_transform(df[i])

# NOTE(review): 'Category_Encoded' (added in an earlier cell) already holds an
# integer encoding of this column, and the category plot that follows receives
# integer codes on its x-axis instead of names - confirm this overwrite is
# intended.
df

In [None]:

# Bar chart of the number of resumes per category, most frequent first
category_order = df['Category'].value_counts().index
plt.figure(figsize=(12, 6))
sns.countplot(data=df, x='Category', order=category_order)
plt.title('Distribution of Job Categories')
plt.xlabel('Category')
plt.ylabel('Count')
plt.xticks(rotation=90)  # vertical tick labels so they do not overlap
plt.show()


In [None]:
from wordcloud import WordCloud

# Concatenate every cleaned resume into one corpus string
text = ' '.join(df['Cleaned_Resume_str'])

# Render the corpus as an 800x400 word cloud on a white background
cloud_settings = dict(width=800, height=400, background_color='white')
wordcloud = WordCloud(**cloud_settings).generate(text)

# Show the rendered image without axes
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.title('Word Cloud of Resumes')
plt.axis('off')
plt.show()


In [None]:
# Character count of each cleaned resume
df['resume_length'] = df['Cleaned_Resume_str'].map(len)

# Histogram of the lengths (30 bins) with a KDE curve overlaid
plt.figure(figsize=(12, 6))
sns.histplot(df['resume_length'], bins=30, kde=True)
plt.xlabel('Resume Length')
plt.ylabel('Frequency')
plt.title('Distribution of Resume Lengths')
plt.show()


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Function to plot N-grams
def plot_ngrams(text_data, ngram_range=(1, 1), num=20):
    """Draw a horizontal bar chart of the most frequent n-grams in *text_data*.

    Args:
        text_data: The documents handed to ``CountVectorizer.fit_transform``.
        ngram_range: ``(min_n, max_n)`` forwarded to ``CountVectorizer``.
        num: How many of the most frequent n-grams to plot.

    Returns:
        None. The chart is displayed with ``plt.show()``.
    """
    counter = CountVectorizer(ngram_range=ngram_range)
    doc_term_counts = counter.fit_transform(text_data)
    # Column sums: total occurrences of each vocabulary entry over all documents
    totals = doc_term_counts.sum(axis=0)

    # Pair every n-gram with its total, order by total (largest first) and
    # keep the leading `num` pairs.
    ranked_pairs = sorted(
        ((term, totals[0, column]) for term, column in counter.vocabulary_.items()),
        key=lambda pair: pair[1],
        reverse=True,
    )
    top_pairs = ranked_pairs[:num]

    top_df = pd.DataFrame(top_pairs, columns=['N-gram', 'Frequency'])

    plt.figure(figsize=(12, 6))
    sns.barplot(x='Frequency', y='N-gram', data=top_df)
    plt.title(f'Top {num} N-grams')
    plt.xlabel('Frequency')
    plt.ylabel('N-gram')
    plt.show()

# Plot the 20 most frequent unigrams (single words), then the 20 most
# frequent bigrams (two-word phrases)
for ngram_size in (1, 2):
    plot_ngrams(df['Cleaned_Resume_str'], ngram_range=(ngram_size, ngram_size), num=20)
