# Importing libararies

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.tokenize import TreebankWordTokenizer
from nltk.stem import PorterStemmer,WordNetLemmatizer
from sklearn.decomposition import TruncatedSVD, NMF
from sklearn.metrics.pairwise import cosine_similarity
import re
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
import pickle
import warnings
warnings.filterwarnings('ignore')

### Reading initial Dataframe

Here we have merged the resumes and job discription together becouse we are useing **topic_modeling** from the text 
so it will help us in increasing the accuracy of the model to build good topics for accurate pridiction.

In [2]:
jobs_df = pd.read_csv("Merged_data.csv")
jobs_df.drop("Unnamed: 0", inplace= True,axis = 1)
jobs_df["Jobs"].value_counts()

Java Developer         84
Sales                  83
Testing                70
Data Science           51
Python Developer       48
Web Designing          45
Mechanical Engineer    40
Automation Testing     26
ux,designer            26
Civil Engineer         24
data,analyst           24
Name: Jobs, dtype: int64

# Text prepocessing

In every **NLP** model we need basic text preprocessing here we are creating a function that will do all the text preprocesing such as **remove URLs** , **remove hashtags** , **Punctuations** , **remove extra whitespace** etc.

In [3]:
# Creating a program to clean Job description by removing URLS, Whitespace ,Punctuations , remove mentions
# remove hashtags
def clean_function(resumeText):
    """ Cleaning / preprocessing text data"""
    resumeText = re.sub('http\S+\s*', ' ', resumeText)  # remove URLs
    resumeText = re.sub('RT|cc', ' ', resumeText)  # remove RT and cc
    resumeText = re.sub('#\S+', '', resumeText)  # remove hashtags
    resumeText = re.sub('@\S+', '  ', resumeText)  # remove mentions
    resumeText = re.sub('[%s]' % re.escape("""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), ' ', resumeText)  # remove punctuations
    resumeText = re.sub(r'[^\x00-\x7f]',r' ', resumeText) 
    resumeText = re.sub('\s+', ' ', resumeText)  # remove extra whitespace
    return resumeText

# Tokenization and stemming

Here we are creating a function that will do preprocessing of text along with **stemming and tokenization** of the words.

In [4]:
# After preprocessing the text data we do tokenization and stemming 
# Creating a program tp tokenize and stem our document
def tokenize_stem(series):
    """ Tokenization and stemming of documents"""
    tokenizer =TreebankWordTokenizer()
    stemmer = PorterStemmer()
    series = series.apply(lambda x: x.replace("\n", ' '))
    series = series.apply(lambda x: clean_function(x))    #text cleaning function/ Test preprocessing
    series = series.apply(lambda x: tokenizer.tokenize(x))
    series = series.apply(lambda x: [stemmer.stem(w) for w in x])
    series = series.apply(lambda x: ' '.join(x))
    return series 

### Turning job Discription into series


In [5]:
# Turning job Discription into series
series = tokenize_stem(jobs_df["Job Description"])
# series[0]

# Vectorization

Converting documents of words into a word vectors with the help of **Tfidf**

In [6]:
# Converting documents of words into a word vectors with the help of Tfidf
# Vectorizing Document
vec = TfidfVectorizer(stop_words= "english",)
doc_words = vec.fit_transform(series)
print(doc_words.A[0])

[0. 0. 0. ... 0. 0. 0.]


In [7]:
len(doc_words.A[0]) #Total features creatrd by tfidf

4499

Here we are building **Topic Modeling** which took input as word vectors and creat topics on its onw, here we created **20 Topic with 6 itteration** this topics will be behave as a independent features for our classification models .


# Building Topic Modling 

In [8]:
# Here we are building a topic Modiling LSD with the help of TruncatedSVD.
topic_m = TruncatedSVD(n_components=20, n_iter=6) #created topic_model with topic 20
topic_m = topic_m.fit(doc_words) #fit and transformed tfidf vectors.
doc_topic = topic_m.transform(doc_words)
print("model_component: ",topic_m.components_[0].shape)

model_component:  (4499,)


In [9]:
topic_m.components_.shape # Model with 20 topics and 4364 components

(20, 4499)

In [10]:
# Creating Programs to display topic numbers and words inside topics
def display_topics(model, feature_names, no_top_words, topic_names=None):
    '''
    displays topics and returns list of toppics
    '''

    topic_list = []
    for i, topic in enumerate(model.components_):
        if not topic_names or not topic_names[i]:
            print("\nTopic ", i)
        else:
            print("\nTopic: '",topic_names[i],"'")

        print(", ".join([feature_names[k]
                       for k in topic.argsort()[:-no_top_words - 1:-1]]))
        topic_list.append(", ".join([feature_names[k]
                       for k in topic.argsort()[:-no_top_words - 1:-1]]))
    return model.components_, topic_list

In [11]:
### Def function to dispay model component array , model, vectorizer,topic_list
def return_topics(series, num_topics, no_top_words, model, vectorizer):
    '''
    returns document_topic matrix and topic modeling model
    '''
    #turn job into series
    series = tokenize_stem(series)
    #transform series into corpus
    ex_label = [e[:30]+"..." for e in series]
    #set vectorizer ngrams = (2,2)
    vec = vectorizer(stop_words = 'english')

    doc_word = vec.fit_transform(series)

    #build model
    def_model = model(num_topics)
    def_model = def_model.fit(doc_word)
    doc_topic = def_model.transform(doc_word)
    #print('model components: ', def_model.components_[0].shape)
    #print('doc_topic', doc_topic[0])
    model_components, topic_list = display_topics(def_model, vec.get_feature_names(), no_top_words)
    return def_model.components_, doc_topic, def_model, vec, topic_list#, topics

In [12]:
array, doc, topic_model, vec, topic_list  = return_topics(jobs_df['Job Description'],20, 10, TruncatedSVD, TfidfVectorizer)



Topic  0
exprienc, month, test, java, develop, project, design, year, sale, python

Topic  1
java, exprienc, month, ajax, develop, spring, j2ee, jqueri, jsp, servlet

Topic  2
test, transform, window, java, check, autom, android, maharashtra, xp, manual

Topic  3
applic, project, ui, design, trust, photoshop, loan, role, websit, respons

Topic  4
pune, python, engin, mechan, januari, maharashtra, design, civil, june, project

Topic  5
sale, exprienc, month, offic, manag, cricket, ms, loan, bajaj, januari

Topic  6
python, data, month, exprienc, year, scienc, django, learn, rest, test

Topic  7
machin, mechan, data, exprienc, nagpur, plaster, engin, civil, month, android

Topic  8
electron, data, pcb, matlab, januari, qualiti, technolog, matrix, electr, 2010

Topic  9
electron, year, qualiti, exprienc, pcb, month, assembl, complet, nashik, hardwar

Topic  10
check, data, electr, resist, juli, scienc, good, analyt, ui, ir

Topic  11
use, pune, pcb, bootstrap, jqueri, matrix, matlab, syn

In [13]:
# Creating a new dataFrame with individual features are topics from topic modeling and dependent feature is jobs.

# Creating Dataframe for predictive model
# topic_df = pd.DataFrame(doc)
# topic_df.columns = ['Topic ' + str(i+1) for i in range(len(topic_df.columns)) ]
# topic_df['job'] = jobs_df.Jobs
# topic_df.to_csv('merged_topic.csv') 

Here we created our final data set for our classification model , i suggest u to use different classification models in my case **Random_forest worked well**

In [14]:
# From vectors of topic modling and df[Job] we created our new datafreame to predict accuracy.
df1 = pd.read_csv('merged_topic.csv')
df1.drop("Unnamed: 0",axis = 1,inplace= True)
df1

Unnamed: 0,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Topic 6,Topic 7,Topic 8,Topic 9,Topic 10,...,Topic 12,Topic 13,Topic 14,Topic 15,Topic 16,Topic 17,Topic 18,Topic 19,Topic 20,job
0,0.262339,-0.014537,-0.105785,-0.028614,0.016251,-0.291180,0.147528,0.046010,0.103826,-0.024114,...,0.032609,-0.044105,0.069617,0.086200,-0.075034,0.072364,0.063650,0.044822,-0.057010,Data Science
1,0.235786,0.071434,-0.053580,-0.108751,0.028166,0.029274,0.238226,0.154460,0.046054,0.094442,...,-0.026736,0.034899,-0.004818,0.043134,-0.050834,0.022262,0.022838,0.049669,-0.039426,Data Science
2,0.359566,0.103821,-0.019467,-0.182431,0.097349,0.019818,0.237412,0.067296,0.116512,0.083251,...,-0.063437,0.015713,0.007060,0.071733,-0.013021,-0.039319,0.063786,0.029918,-0.043142,Data Science
3,0.342101,-0.053513,-0.100499,-0.017645,0.011028,-0.224039,0.173264,0.058909,0.043988,0.010256,...,-0.055015,-0.065209,0.017938,0.057563,-0.064367,-0.003682,0.011635,0.052728,-0.052765,Data Science
4,0.387734,0.218800,-0.033395,-0.186137,-0.031608,0.107499,0.341216,0.187209,0.072586,0.179337,...,-0.018439,0.051923,-0.025246,0.048195,-0.024963,0.025634,0.052697,0.045288,-0.173923,Data Science
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
516,0.235412,-0.143217,-0.170880,-0.050714,-0.010634,-0.284425,0.069923,0.095856,0.068074,-0.013810,...,0.038280,-0.006338,0.003024,-0.018697,0.017793,0.004649,0.002119,-0.027679,-0.088679,Data Science
517,0.175965,-0.093753,-0.131596,-0.038796,0.029315,-0.244000,0.078209,0.068859,0.069110,0.013058,...,0.013157,0.001362,0.004671,-0.030603,-0.016305,0.024579,-0.016979,0.008965,-0.047603,Data Science
518,0.247148,-0.187813,-0.185143,-0.081691,-0.037593,-0.263858,0.087064,0.095161,0.059386,-0.019152,...,0.053166,0.016870,-0.007275,-0.029939,0.019934,0.010247,-0.008598,-0.039058,-0.100412,Data Science
519,0.252654,-0.164322,-0.205087,-0.097670,0.020609,-0.317458,0.129338,0.133066,0.123531,-0.017406,...,0.011619,-0.032782,0.033246,-0.008420,-0.036806,0.032929,0.002773,0.015592,-0.065947,Data Science


# Predictive model training

In [15]:
x = df1.drop("job",axis = 1)
y = df1["job"]

In [16]:
#Splitinf data for training amd testing.
x_train,x_test,y_train,y_test = train_test_split(x,y,random_state= 1,test_size= .3)

In [17]:
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(364, 20)
(364,)
(157, 20)
(157,)


In [18]:
rfc = RandomForestClassifier()
rfc.fit(x_train, y_train)
print('Training_accuracy: ', np.mean(cross_val_score(rfc, x_train, y_train, scoring = 'accuracy', cv=5)))
print('Testing_accuracy: ', accuracy_score(y_test, rfc.predict(x_test)))
# print(rfc.predict(x_test))

Training_accuracy:  0.9780441400304415
Testing_accuracy:  0.9808917197452229


# Hyper parameter tunning

In [19]:
# param_grid = {'n_estimators': [100,300, 400, 500, 600], 'max_depth': [3,7,9, 11]}
# search = GridSearchCV(RandomForestClassifier(), param_grid, cv=5)
# search.fit(x_train, y_train)
# best_parameters = search.best_params_
# print(best_parameters)


{'max_depth': 9, 'n_estimators': 400}


In [21]:
# Created new Prediction models with best parameters 
rfc = RandomForestClassifier(n_estimators = 400, max_depth = 9)
rfc.fit(x_train, y_train)
print('Training_accuracy: ', np.mean(cross_val_score(rfc, x_train, y_train, scoring = 'accuracy', cv=5)))
print('Testing_accuracy: ', accuracy_score(y_test, rfc.predict(x_test)))

Training_accuracy:  0.9807838660578387
Testing_accuracy:  0.9808917197452229


Here we can see that after Hyper parameter tunning we got the **Accuracy of training -98 and testing - 98** so we can conclude that our model is working well

In [22]:
# Defining a function which convert resume and give job pridiction probablities
def predict_resume(topic_model, model, resume):
    '''
    transforms a resume based on the topic modeling model and return prediction probabilities per each job class
    '''
    doc = topic_model.transform(resume) # frist preprocess data into array and then transform to 
    return model.predict_proba(doc), model.classes_


In [23]:
def get_topic_classification_models():
    jobs_df, model, vec , topic_list= process_data()
    model_1 = predictive_modeling(jobs_df)
    return model, model_1, vec                      # Model_1 is rfs , model is TruncateSVD, vec is Tfidf
# Pickling the model , classifier and vectors for future use.
# topic_model, classifier, vec= get_topic_classification_models()
# topic_model_name = 'topic_model.sav'
# classifier_name = 'classification_model.sav'
# vec_name = 'job_vec.sav'
# pickle.dump(topic_model, open(topic_model_name, 'wb'))
# pickle.dump(classifier, open(classifier_name, 'wb'))
# pickle.dump(vec, open(vec_name, 'wb'))


In [24]:
# Both this upper program requires for our final function to show resume matching jon probablities

# Showing Resume matching job probablities

def Show_prob(resume, topic_model, predictor, vec):
    '''
    run code that predicts resume
    '''
    #jobs_df, model, vec , topic_list= process_data()
    #model_1 = predictive_modeling(jobs_df)
    series =pd.Series(resume)
    doc = tokenize_stem(series)
    doc = vec.transform(doc)
    probabilities, classes = predict_resume(topic_model, predictor, doc)
    result = pd.DataFrame(zip(classes, probabilities[0]*100), columns = ['Jobs', 'Matching_Probablity'])
    return result


# Testing Sample Resume on our model

In [25]:
manager = """Administrative Assistant with 6+ years of experience preparing flawless presentations, assembling facility reports, and maintaining the utmost confidentiality. Possesses a B.A. in History and expertise in Microsoft Excel. 
Looking to leverage my knowledge and experience into a role as Project Manager.

Professional Experience

ADMINISTRATIVE ASSISTANT
REDFORD & SONS – Chicago, IL 
SEP 2019  
 –  Present
    • Schedule and coordinate meetings, appointments, and travel arrangements for supervisors and managers
    • Trained 2 administrative assistants during a period of company expansion to ensure attention to detail and adherence to company
    • Developed new filing and organizational practices, saving the company $3,000 per year in contracted labor expenses
    • Maintain utmost discretion when dealing with sensitive topics
    • Manage travel and expense reports for department team members
SECRETARY
BRIGHT SPOT LTD – Boston, MA 
JUN 2017
–  AUG 2019
    • Type documents such as correspondence, drafts, memos, and emails, and prepared 3 reports weekly for management
    • Opened, sorted, and distributed incoming messages and correspondence
    • Purchased and maintained office suppled inventories, and always carefully adhered to budgeting practices
    • Greeted visitors and helped them either find the appropriate person or schedule an appointment
    • Recorded, transcribed, and distributed minutes of meetings
SECRETARY
SUNTRUST FINANCIAL – Chicago, IL 
JUN 2015 
– AUG 2017
    • Recorded, transcribed and distributed weekly meetings
    • Answered upwards of 20 phone calls daily, taking detailed messages
    • Arranged appointments and ensured executives arrived to meetings with 
clients on time
"""

In [26]:
Show_prob(manager,topic_m,rfc,vec)

Unnamed: 0,Jobs,Matching_Probablity
0,Automation Testing,8.028101
1,Civil Engineer,1.787866
2,Data Science,1.94877
3,Java Developer,5.767912
4,Mechanical Engineer,2.220644
5,Python Developer,1.047715
6,Sales,47.631957
7,Testing,9.916923
8,Web Designing,1.26044
9,"data,analyst",12.854215


In [27]:
ux = """EXPERTISE
Design Strategy, User Experience, Information Architecture, Usability, UI flows, Web / Mobile / Tablet application Design, Wireframe & Mockup Development, Rapid Prototyping, Visual Design, Corporate Branding
TOOLS
Fireworks, Photoshop, Illustrator, Flash, Dreamweaver, InDesign, OmniGraffle, HTML, CSS, JS
EXPERIENCE
Sr. UX Designer | Confidential,Mountain View, CA | 06/11 – Present
• Collaborated with Product Owners and multiple Design teams across the company
• Re-designed the UI and design patterns of an acquired product (zavers) to align with Google brand & products
• Presented design strategies at the UX reviews to the higher management
• Completed end-to-end Google coupons phase 1.0 design for both web and mobile 
• Initiated Google coupons phase 2.0 re-design for tighter integration with Google wallet and offers
Lead UX Designer | Confidential,San Francisco, CA | 04/10 – 06/11 
• Brainstormed and implemented UI solutions for various projects and multiple teams both local and remote
• As a CX lead I’d collaborated with User Researchers, Visual Designers and Content Strategists 
• Designed a common platform for legacy Wachovia users migrating to Wells Fargo for “Check Recovery Service” tool 
• Redesigned the “Mortgage Product Recommendation Tool”, also helped creating mockups and prototyped using Flash Catalyst for Usability Study
Sr. UX Designer | Confidential,San Jose, CA | 09/09 – 02/10 
• Worked with multiple Business units to create usable solutions for various projects 
• Redesigned AdCommerce UI flows and visual design for the pilot launch 
• Analyzed, implemented and revised designs based on research findings 
• Automated the existing manual Unpaid Item, also created UI and visuals for the same 
• Worked closely with the design standards group to stay compliant with eBay patterns
Lead UX Designer | Confidential,Redwood City, CA | 09/08 - 09/09 
• Worked closely with the Business & Product team to analyze project requirements 
• Defined page schematics to guide product decision making & Engineering development 
• Initiated User research, Usability & Concept testing 
• Created UI sketches, flows, wireframes & rapid prototypes for Web & Mobile platforms 
• Designed UI solutions for internal product & partners (e.g. Citi, MasterCard & Nokia)
Sr. UX Designer | Confidential,San Jose, CA | 06/07 - 09/08 
• Collaborated with cross-functional teams to provide UX support 
• Created wireframes, visual assets and high fidelity mockups 
• Designed the Virtual Terminal & Store Front applications for Merchant Services 
• Designed the Global Seller Registration signup process for Market Places 
• Reworked the Large & Small Merchant Business implementation guide per PayPal 
branding & also delivered UED spec for various projects
Sr. UI Developer | Confidential,Austin, TX | 09/04 - 05/07 
• Designed/maintained Visa Information Management Portal & other Financial portlets 
• Worked with Product Office to define new requirements 
• Develop and Prototype interaction while consulting with development team for feasibility 
• Coded & Drove static demos & implement feedback from Business Users 
• Tested application for business rules accuracy during post implementation cycle also created “Demo Videos” to support internationalization
Interactive Designer| Confidential,Atlanta, GA | 05/04 - 09/04 
• Gathered & Reviewed project requirements from Product Owners 
• Developed Flash based Interactive tutorials (CBT/WBT’s) 
• Created design templates & Interfaces 
• Audio & Video Editing
Lead Web Designer | Confidential,Chennai, India | 05/02 - 05/04 
• Developed & Designed Flash, Static & Dynamic Websites 
• Designed Corporate Identity, branding & Marketing materials 
• Design Team management, training & documentation 
• Communicated directly with End-Clients & Offshore Team
Creative Visualizer | Confidential,Chennai, India | 06/00 - 04/02 
• Brainstorm, Iterate & Storyboard creative design solutions 
• Visualize & Design for the Web & Print 
• Created User-Friendly interface, optimized for quick downloads 
• Chose Optimal Technology for Scalability and Speed, Single-Click access to information and concise content
Freelancer | Confidential,Oxfordshire, UK | 10/02 - 02/03 
• Designed Solutions for Online Business Promotion 
• Marketing Design Support targeting increase in seasonal promotions 
• Designed Mailers, Banners & Print collaterals for in-house & affiliates 
• Seasonal Website Updates and Maintenance
EDUCATION
Certified Usability Analyst
Bachelor of Information Technology
Diploma in Graphic Arts
Diploma in Digital Pre-Press"""

In [28]:
Show_prob(ux,topic_m,rfc,vec)

Unnamed: 0,Jobs,Matching_Probablity
0,Automation Testing,3.990327
1,Civil Engineer,0.0
2,Data Science,3.158514
3,Java Developer,1.791667
4,Mechanical Engineer,5.908929
5,Python Developer,0.625
6,Sales,6.767857
7,Testing,2.0
8,Web Designing,7.537558
9,"data,analyst",8.358631


In [29]:
ds = """Trish Mathers
Entry-Level Data Scientist
Innovative and scientiﬁcally rigorous recent graduate with a signiﬁcant data science internship experience to bring to the table. With a team-oriented attitude, I am eager to contribute my abilities in quantitative modeling and experimentation to enhance the experience of Pinterest users around the world.
tmathers@email.com (123) 456-7890
Bellevue, WA LinkedIn

WORK EXPERIENCE
Niantic
Data Scientist Intern
Seattle, WA | April 2020 - April 2021
    • Developed a program in SAS that automated reﬁnement of linear regression models for speciﬁc segments of a customer base that saved
22 hours of labor per month.
    • Received, cleaned, and prepped data from client using SAS, SQL, and Excel to help data scientists build marketing mix models that resulted in
a lift in ROI of 10 basis points.

Seattle University Tutor Center
Statistics and Mathematics Tutor
Seattle, WA | April 2019 - April 2020
    • Assessed students' learning to determine learning weaknesses and needs, successfully helping students perform 13% better in algebra, pre-
calculus, calculus, and statistics undergraduate courses.
    • Met with 30+ students per week through online learning platforms or in a 1:1 setting at the tutor center.
    • Scheduled weekly appointments for students, and set schedules for student statistics and math tutors.
    • Communicated with professors about curriculum, and submitted reports 2 times per week to maintain up-to-date learning plans for students.

PROJECTS
Fantasy Football Models
    • Aggregated and prepped 3 years of fantasy football projection data from 3 independent sources into a MySQL database.
    • Created a random forest model in SAS, combining disparate sources into one projection that outperformed the mean absolute error of the next
best projection by 15%.

Entertainment Engine
    • Aggregated data from IMDB and Rotten Tomatoes, and used k-nearest- neighbors in SAS, constructing an enhanced entertainment selection
targeted to reach 15- to 25-year-olds.
    • Improved methodologies to save an average of 12 minutes per movie selection and 3 minutes per song selection.
SKILLS
    • Programming: SAS (base SAS and Macros), SQL
    • Supervised Learning: linear and logistic regressions,
decision trees, support vector machines (SVM)
    • Unsupervised Learning: k- means clustering, principal
component analysis (PCA)
    • Data Visualization: Excel, Google Sheets

EDUCATION
B.S.
Mathematics and Economics Seattle University
September 2017 - April 2021 Seattle, WA
GPA: 3.7

RELEVANT COURSES
    • Intermediate programming
    • Probability & Statistics
    • Linear Algebra
    • Applied Econometrics
    • Game Theory
    • Calculus 1-3"""

In [30]:
Show_prob(ds,topic_m,rfc,vec)

Unnamed: 0,Jobs,Matching_Probablity
0,Automation Testing,2.503261
1,Civil Engineer,2.936322
2,Data Science,42.16314
3,Java Developer,0.443182
4,Mechanical Engineer,1.212121
5,Python Developer,1.310307
6,Sales,1.555156
7,Testing,0.87375
8,Web Designing,1.186368
9,"data,analyst",38.726088


In [31]:
anki = """ANKIT DHANORE
Seeking Data Scientist Position
Phone no :- 7020276280
Email	:- ankitd08.job@gmail.com
Address	:- MQ 641, shubhash nagar, ghugus, chandrapur, maharashtra ,442505 . Github	:- https://github.com/ankitd08/Personal-projects
Linkedin   :-  https://www.linkedin.com/in/ankit-dhanore-a63970259
ABOUT ME :

Seeking a data scientist position to use my learnings to help the business meet strategic and operational goals by identifying opportunities to deploy new technology in data science. Possess expertise in Python & Data Analytics, ML, NLP, and DL modeling, and proven ability to manage complex tasks.
PROFESSIONAL SYNOPSIS :
Sound Understanding in AI and Machine Learning & DSA (Data Structures & Algorithms). Proficient in Core Python and its libraries and Modeling - regression, classification, linear etc. Proficient in building highly scalable and optimized engines for Data Analytics.
Strong exposure in RDBMS (MySQL, MongoDB, Document DB, MSSQL) and Servers. Good in exposing and consuming APIs, RESTAPI, Web Services
DATA SCIENCE, PYTHON & ML :


Strong Python Programming skills using OOPS, Functions, Modularization & decorators. Sound Knowledge of Python’s Data Analysis and Machine Learning Libraries.
Development of REST APIs in Python. Experience in Web Framework Flask. Implementation of regularization techniques like Lasso and Ridge in regression.
Data mining algorithm experience in the families of predictive algorithms (Regression, KNN, Decision Trees) and clustering algorithms (k-means clustering).
Ability to process Text Processing using NLTK library and other Natural Language .
Thorough understanding of Probability and Statistics, Bayesian methods, Time Series analysis. Experience in data management tools – Non Relational and SQL databases.
Ability to use Web Scraping Tools as Beautiful Soup.
Source code management and Version Control system using Git and GitHub. Thorough understanding of supervised and unsupervised algorithms.
SKILLS :
Languages :- Python, SQL
Framework and Libraries :- Flask, Numpy, Pandas, Matplotlib, Seaborn, Scikit-learn, Tensorflow, OpenCV, Scipy etc
NLP and Deep Learning :- word2vec, word cloud, word embedding, Basic of Deep Learning and AWS. etc
ML algorithms :- Linear and logistic Regression, KNN, Support vector machine, Naive Bayes, Decision Tree, Random Forest, Adaboost, hierarchicalclustering, k means clustering.
STRENGTHS :
Ability to write a clean and production code with Object Oriented Programming in Python. Able to investigate Data Visualization and summarization techniques conveying key findings"""

In [32]:
Show_prob(anki,topic_m,rfc,vec)

Unnamed: 0,Jobs,Matching_Probablity
0,Automation Testing,0.517857
1,Civil Engineer,1.521739
2,Data Science,83.446518
3,Java Developer,0.625
4,Mechanical Engineer,0.75
5,Python Developer,2.0
6,Sales,0.259615
7,Testing,0.5
8,Web Designing,0.910714
9,"data,analyst",6.707686


In [33]:
sales = """Handled all client relations and sales for portfolio of 70-80 accounts. Conducted sales presentations to decision makers and generate quotes that meet each client’s needs. Worked effectively in remote and office settings while traveling ~50% of the time. Maintained in-depth knowledge of product and services portfolio.

Awarded President’s Club in 2018, 2019, and 2020 for achieving over 105% of sales goal.
Grew major client account by 50% by earning their trust and recommending service that solved one of their key problems.
Sourced 10 new clients through cold calling and digital lead conversion.
Organized monthly events to demonstrate products and services, building pipeline of over 600 prospective clients.
Selected for and completed sales leadership development program.
Promoted to invigorate stagnant sales growth and revitalize product launch/development. Managed $25M district P&L among 150 employees and 1.5K partners. Optimized retention strategies through renovating company value proposition to address clients’ changing needs. Maximized local resources and relationships to implement company-wide sales and marketing initiatives.

Managed customer relationships with local distribution centers and partners to drive district sales before initial product launch.
Achieved the highest district customer renewal rate, increasing 15% from prior year.
Rebuilt continuing education and training resources for associate sales representatives, raising individual quota achievement 17% over two years.
Oversee hiring, training and management of team of 15-20 sales representatives. Create strategy to continually achieve and exceed sales goals by prospecting clients and implementing effective sales processes. Directly manage key accounts and serve as escalation point for client issues and concerns.

Achieved 105% of sales quota in 2019 and 102% in 2020.
Introduced new coaching strategy for sales reps, creating customized plans that capitalize on individual strengths and motivators and holding monthly 1-on-1 meetings to review plan.
Implemented Salesforce to track over 10,000 prospective clients and manage 500+ existing client accounts.
Grew 5 major accounts by 30% by building client rapport and presenting solutions to client needs.
Created new sales materials for current and upcoming products."""

In [34]:
Show_prob(sales,topic_m,rfc,vec)

Unnamed: 0,Jobs,Matching_Probablity
0,Automation Testing,0.25
1,Civil Engineer,0.25
2,Data Science,0.754032
3,Java Developer,0.75
4,Mechanical Engineer,0.25
5,Python Developer,0.25
6,Sales,92.567396
7,Testing,1.678571
8,Web Designing,1.25
9,"data,analyst",0.75


In [36]:
civil = """Earl Beckstrom
Sometown, MI 48103
Phone: (555) 555-5555
Email: eb@somedomain.com | LinkedIn URL

CIVIL ENGINEER

Upcoming graduate of ABET-accredited bachelor’s in civil engineering program. Backed by successful internship experience and knowledge of engineering theories, principles, specifications and standards. Plan to earn Engineer in Training certification upon graduation.
Demonstrated 3D skills with the ability to design site layouts from concept through completion. Proficient user of AutoCAD Civil 3D, MicroStation and ArcGIS.
Knowledge of Sometown, MI Municipalities Planning Code and zoning, subdivision and storm water ordinances.
Education & Credentials

ABC UNIVERISTY, Sometown, MI
Bachelor of Science in Civil Engineering program, 120/132 credits completed

Honors: Chi Epsilon (Civil Engineering Honor Society), Dean’s List (5 semesters)
Activities: Member, American Society of Civil Engineers and Emerging Green Builders (EGB); Planning Committee, Engineering Expo
Course Highlights:
Civil Engineering Design
Cost Estimating & Surveying
Structural Analysis & Dynamics
Geotechnical Engineering
Construction Methods
Traffic & Materials Engineering
Environmental Engineering
Water Resource Engineering
Fluid Mechanics & Hydraulics
Concrete & Steel Design
Professional Experience

XYZ COMPANY, Sometown, MI
Engineering firm serving government and commercial clients.
Intern, Civil Engineering Group, September 2016 to Present

Assisted civil engineers on several key government projects involving roadway designs and improvements, solutions easing traffic congestion and replacement of deteriorating bridges.
Handled cost-of-materials estimations, report and document tracking, project documentation, on-site project visits, invoice/agreement verification and building permit applications.
Gained experience in blueprint reading, as well as preparation of maps and plans.
ABC COMPANY, Sometown, MI
Worked in telesales throughout college to help finance education.
Sales Representative, June 2014 to August 2016 (seasonal)

Cold-called small business owners to sign new accounts for ABC Company’s print advertising service.
Cultivated excellent relationships throughout assigned territory."""

In [37]:
Show_prob(civil,topic_m,rfc,vec)

Unnamed: 0,Jobs,Matching_Probablity
0,Automation Testing,6.452799
1,Civil Engineer,57.392628
2,Data Science,1.404942
3,Java Developer,1.859649
4,Mechanical Engineer,16.202174
5,Python Developer,2.471404
6,Sales,3.148549
7,Testing,1.227532
8,Web Designing,2.554762
9,"data,analyst",4.949601


### Checking imbalance of data for each catogeries

In [35]:
jobs_df["Jobs"].value_counts() # here we can see that job discription are imbalanced 

Java Developer         84
Sales                  83
Testing                70
Data Science           51
Python Developer       48
Web Designing          45
Mechanical Engineer    40
Automation Testing     26
ux,designer            26
Civil Engineer         24
data,analyst           24
Name: Jobs, dtype: int64