In [81]:
import re
import pandas as pd
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline  # Don't forget this import
from sklearn.model_selection import GridSearchCV

In [82]:
data = pd.read_csv('resume.csv')
data

Unnamed: 0,Category,Resume
0,Data Science,Skills * Programming Languages: Python (pandas...
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...
2,Data Science,"Areas of Interest Deep Learning, Control Syste..."
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...
4,Data Science,"Education Details \r\n MCA YMCAUST, Faridab..."
...,...,...
957,Testing,Computer Skills: â¢ Proficient in MS office (...
958,Testing,â Willingness to accept the challenges. â ...
959,Testing,"PERSONAL SKILLS â¢ Quick learner, â¢ Eagerne..."
960,Testing,COMPUTER SKILLS & SOFTWARE KNOWLEDGE MS-Power ...


In [83]:
data.describe()

Unnamed: 0,Category,Resume
count,962,962
unique,25,166
top,Java Developer,"Technical Skills Web Technologies: Angular JS,..."
freq,84,18


In [84]:
data['Category'].unique()

array(['Data Science', 'HR', 'Advocate', 'Arts', 'Web Designing',
       'Mechanical Engineer', 'Sales', 'Health and fitness',
       'Civil Engineer', 'Java Developer', 'Business Analyst',
       'SAP Developer', 'Automation Testing', 'Electrical Engineering',
       'Operations Manager', 'Python Developer', 'DevOps Engineer',
       'Network Security Engineer', 'PMO', 'Database', 'Hadoop',
       'ETL Developer', 'DotNet Developer', 'Blockchain', 'Testing'],
      dtype=object)

In [85]:
data.isnull().sum()

Category    0
Resume      0
dtype: int64

In [86]:
data.drop_duplicates(subset="Resume", keep='first', inplace=True)

In [87]:
data.describe()

Unnamed: 0,Category,Resume
count,166,166
unique,25,166
top,Java Developer,Skills * Programming Languages: Python (pandas...
freq,13,1


In [88]:
data["Resume"][0]

'Skills * Programming Languages: Python (pandas, numpy, scipy, scikit-learn, matplotlib), Sql, Java, JavaScript/JQuery. * Machine learning: Regression, SVM, NaÃ¯ve Bayes, KNN, Random Forest, Decision Trees, Boosting techniques, Cluster Analysis, Word Embedding, Sentiment Analysis, Natural Language processing, Dimensionality reduction, Topic Modelling (LDA, NMF), PCA & Neural Nets. * Database Visualizations: Mysql, SqlServer, Cassandra, Hbase, ElasticSearch D3.js, DC.js, Plotly, kibana, matplotlib, ggplot, Tableau. * Others: Regular Expression, HTML, CSS, Angular 6, Logstash, Kafka, Python Flask, Git, Docker, computer vision - Open CV and understanding of Deep learning.Education Details \r\n\r\nData Science Assurance Associate \r\n\r\nData Science Assurance Associate - Ernst & Young LLP\r\nSkill Details \r\nJAVASCRIPT- Exprience - 24 months\r\njQuery- Exprience - 24 months\r\nPython- Exprience - 24 monthsCompany Details \r\ncompany - Ernst & Young LLP\r\ndescription - Fraud Investigatio

In [89]:

for idx, row in data["Resume"].items():
    cleaned_text = re.sub('[^a-zA-Z0-9]', " ", row.strip())  # Preserve spaces for readability
    cleaned_text = cleaned_text.replace("\r", '').replace("\n", '').replace("â", '').lower()
    data.loc[idx, "cleaned"] = cleaned_text


In [90]:
data

Unnamed: 0,Category,Resume,cleaned
0,Data Science,Skills * Programming Languages: Python (pandas...,skills programming languages python pandas...
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...,education details may 2013 to may 2017 b e ...
2,Data Science,"Areas of Interest Deep Learning, Control Syste...",areas of interest deep learning control syste...
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...,skills r python sap hana table...
4,Data Science,"Education Details \r\n MCA YMCAUST, Faridab...",education details mca ymcaust faridabad...
...,...,...,...
894,Testing,Computer Skills: â¢ Proficient in MS office (...,computer skills proficient in ms office ...
895,Testing,â Willingness to accept the challenges. â ...,willingness to accept the challenges ...
896,Testing,"PERSONAL SKILLS â¢ Quick learner, â¢ Eagerne...",personal skills quick learner eagerne...
897,Testing,COMPUTER SKILLS & SOFTWARE KNOWLEDGE MS-Power ...,computer skills software knowledge ms power ...


In [91]:

import nltk
nltk.download('stopwords')

# Load English stopwords
stop_words = set(stopwords.words('english'))

# Function to remove stopwords from a text
def remove_stopwords(text):
    words = text.split()
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_words)

# Apply the remove_stopwords function to the "cleaned" column
data["cleaned_no_stopwords"] = data["cleaned"].apply(remove_stopwords)
data

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ayushbachuwar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,Category,Resume,cleaned,cleaned_no_stopwords
0,Data Science,Skills * Programming Languages: Python (pandas...,skills programming languages python pandas...,skills programming languages python pandas num...
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...,education details may 2013 to may 2017 b e ...,education details may 2013 may 2017 b e uit rg...
2,Data Science,"Areas of Interest Deep Learning, Control Syste...",areas of interest deep learning control syste...,areas interest deep learning control system de...
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...,skills r python sap hana table...,skills r python sap hana tableau sap hana sql ...
4,Data Science,"Education Details \r\n MCA YMCAUST, Faridab...",education details mca ymcaust faridabad...,education details mca ymcaust faridabad haryan...
...,...,...,...,...
894,Testing,Computer Skills: â¢ Proficient in MS office (...,computer skills proficient in ms office ...,computer skills proficient ms office word basi...
895,Testing,â Willingness to accept the challenges. â ...,willingness to accept the challenges ...,willingness accept challenges positive thinkin...
896,Testing,"PERSONAL SKILLS â¢ Quick learner, â¢ Eagerne...",personal skills quick learner eagerne...,personal skills quick learner eagerness learn ...
897,Testing,COMPUTER SKILLS & SOFTWARE KNOWLEDGE MS-Power ...,computer skills software knowledge ms power ...,computer skills software knowledge ms power po...


In [92]:

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    data['cleaned_no_stopwords'], data['Category'], test_size=0.2, random_state=42
)

# Create a pipeline with TF-IDF vectorizer and RandomForestClassifier
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=1000)),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Define hyperparameters for tuning (you may need to fine-tune this)
parameters = {
    'tfidf__max_features': [500, 1000, 2000],
    'classifier__n_estimators': [50, 100, 200]
}

# Use GridSearchCV for hyperparameter tuning
grid_search = GridSearchCV(pipeline, parameters, cv=5)
grid_search.fit(X_train, y_train)

# Get the best model from the grid search
best_model = grid_search.best_estimator_

# Evaluate the model on the test set
y_pred = best_model.predict(X_test)

# Evaluate the performance
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print("Classification Report:\n", classification_rep)




Accuracy: 0.7941176470588235
Classification Report:
                            precision    recall  f1-score   support

                 Advocate       1.00      1.00      1.00         2
                     Arts       0.67      1.00      0.80         2
       Automation Testing       0.00      0.00      0.00         0
         Business Analyst       0.00      0.00      0.00         3
           Civil Engineer       1.00      1.00      1.00         2
             Data Science       0.50      1.00      0.67         1
                 Database       0.67      1.00      0.80         2
          DevOps Engineer       0.00      0.00      0.00         0
            ETL Developer       0.00      0.00      0.00         1
   Electrical Engineering       1.00      1.00      1.00         1
                       HR       1.00      1.00      1.00         5
                   Hadoop       1.00      1.00      1.00         3
       Health and fitness       1.00      0.50      0.67         2
        

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
