## PCA and Clustering 


In [14]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import silhouette_score
from sklearn.metrics import euclidean_distances
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.corpus import wordnet

In [15]:
df = pd.read_csv('../data/Cleaned_Job_desc_data')


In [16]:
#Disparity / Class Balance Check
df['job_title'].value_counts()


financial+analyst    482
data+scientist       371
physician            325
underwriter          299
chemical+engineer    252
recruiter            248
Name: job_title, dtype: int64

In [17]:
def remove_stopwords(stopWoFinancrds, descriptions):
    cleaned_descriptions = []
    for description in descriptions:
        temp_list = []
        for word in description.split():
            if word not in stopWords:
                temp_list.append(word.lower())
        cleaned_descriptions.append(' '.join(temp_list))
    return np.array(cleaned_descriptions)

def remove_punctuation(descriptions):
    no_punct_descriptions = []
    for description in descriptions:
        description_no_punct = ' '.join(RegexpTokenizer(r'\w+').tokenize(description))
        no_punct_descriptions.append(description_no_punct)
    return np.array(no_punct_descriptions)

def get_wordnet_pos(word):
    # nltk.download()

    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {'J': wordnet.ADJ,
               'N': wordnet.NOUN,
               'V': wordnet.VERB,
               'R': wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

def lemmatize_descriptions(descriptions):
    cleaned_descriptions = []
    for description in descriptions:
        temp_list = []
        for word in description.split():
            cleaned_word = WordNetLemmatizer().lemmatize(word, get_wordnet_pos(word))
            temp_list.append(cleaned_word)
        cleaned_descriptions.append(' '.join(temp_list))
    return np.array(cleaned_descriptions)

def clean_descriptions(stopWords, descriptions):
    no_punct = remove_punctuation(descriptions)
    no_punct_sw = remove_stopwords(stopWords, no_punct)
    cleaned = lemmatize_descriptions(no_punct_sw)
    return cleaned

def get_representative_jobs(df, kmeans):
    cluster_centers = kmeans.cluster_centers_
    for cent in cluster_centers:
        print('\nCluster Represnetations')
        dist = euclidean_distances(cent.reshape(1,-1), tfidf)
        order = np.argsort(dist)
        for o in order[0][:5]:
            title = df['job_title'].iloc[o]
            print(title)

In [11]:


if __name__ == '__main__':
    import os
    print("curr path is:")
    print(os.getcwd())
    


    # Reading in data
    df = pd.read_csv('../data/Cleaned_data_Set_5_jobs')
    df=df.drop(columns = "location")
#     print(df['job_desc'])
    descriptions = df['job_desc'].values

    # Creating stop words
    stopWords = set(stopwords.words('english'))
    add_stopwords = {
        'join', 'work', 'team', 'future', 'digital', 'technology', 'access', 'leader', 'industry', 'history', 'innovation',
        'year', 'customer', 'focused', 'leading', 'business', 'ability', 'country', 'employee', 'www', 'seeking',
        'location', 'role', 'responsible', 'designing', 'code', 'ideal', 'candidate', 'also', 'duty', 'without', 'excellent',
        'set', 'area', 'well', 'use', 'strong', 'self', 'help', 'diverse', 'every', 'day', 'equal', 'employment', 'opportunity',
        'affirmative', 'action', 'employer', 'diversity', 'qualified', 'applicant', 'receive', 'consideration', 'regard',
        'race', 'color', 'religion', 'sex', 'national', 'origin', 'status', 'age', 'sexual', 'orientation', 'gender',
        'identity', 'disability', 'marital', 'family', 'medical', 'protected', 'veteran', 'reasonable', 'accomodation',
        'protect', 'status', 'equal', 'discriminate', 'inclusive', 'diverse'
    }
    
    stopWords = stopWords.union(add_stopwords)

    # Initializing punctuation remover and lemmatizer
    tokenize_remove_punct = RegexpTokenizer(r'\w+')
    lemma = WordNetLemmatizer()

    # Cleaning descriptions for both the whole dataset and CO only
    cleaned_descriptions = clean_descriptions(stopWords, descriptions)

    # Vectorizing words creating both tf and tf-idf matrices
    tfidf_vectorizer = TfidfVectorizer(stop_words=stopWords, min_df=.15, max_df=0.75, max_features=5000)
    tfidf = tfidf_vectorizer.fit_transform(cleaned_descriptions).toarray()


curr path is:
/home/aqeelali7/Documents/Galvanize/Capstone-3-ATS/The-Right-Resume/src


In [12]:

    # Initializing and fitting k-means model
    kmeans = KMeans(n_clusters=6, n_jobs=-1)
    kmeans.fit(tfidf)

    # Returning most representative words for each cluster
    get_representative_jobs(df, kmeans)

    # Calculating model score for kmeans
    silhouette_score(tfidf, kmeans.labels_)
    kmeans.score(tfidf)
    
    #Visualizing k-means clusters with PCA graph
    kmeans_model = kmeans
    labels=kmeans_model.labels_.tolist()

    pca = PCA(n_components=2).fit(tfidf)
    datapoint = pca.transform(tfidf)





Cluster Represnetations
physician
physician
physician
physician
chemical+engineer

Cluster Represnetations
physician
physician
physician
physician
physician

Cluster Represnetations
financial+analyst
financial+analyst
financial+analyst
financial+analyst
financial+analyst

Cluster Represnetations
physician
chemical+engineer
physician
chemical+engineer
chemical+engineer

Cluster Represnetations
chemical+engineer
physician
chemical+engineer
physician
chemical+engineer

Cluster Represnetations
physician
physician
chemical+engineer
physician
chemical+engineer


In [13]:

    plt.figure

    label1 = ["#FFFF00", "#008000", "#0000FF", "#FF0000","#33fff6"]
    color = [label1[i] for i in labels]
    plt.scatter(datapoint[:, 0], datapoint[:, 1], c=color)
    centroids = kmeans_model.cluster_centers_
    centroidpoint = pca.transform(centroids)
    plt.scatter(centroidpoint[:,0], centroidpoint[:,1], marker='^', s=150, c="#000000", label='Cluster Centers')
    plt.xlabel('First PCA Dimension')
    plt.ylabel('Second PCA Dimension')
    plt.title('K-Means Clusters')
    plt.legend(fontsize='x-small')
    plt.text(0.44,0.6, 'Blue: Mobile devs', fontsize=9)
    plt.text(0.44, 0.5, 'Yellow: Data science', fontsize=9)
    plt.text(0.44, 0.4, 'Green: Big data dev', fontsize=9)
    plt.text(0.44, 0.4, 'Red: Big data dev', fontsize=9)
    
    plt.tight_layout()
#     plt.savefig('../imgs/pca_kmeans_3_clusters.png');

IndexError: list index out of range


Given training on job descriptions, the goal is that the model can classify into these clusters (with percentage fit/matching) with NEW inputs. 

PCA/Cosine Similarity with eigenvectors. Model tells us % similarity.

 - go back and look at how features fit into the clusters
 
Find HTML template


### Run model (some sort of linear model)
#### Find Feature significance 

## LDA for Feature Importance



## Cosine Similarity Matching

In [45]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
count_matrix = cv.fit_transform(cleaned_descriptions)

In [54]:
resume = "AQEEL ALI Aqeelali0312@gmail.com | (408) 718-0712 | www.linkedin.com/in/aqeelali786 EDUCATION: California Polytechnic University, San Luis Obispo, CA B.S. Business Administration – Financial Management Concentration Minor in Psychology ● Honors: Principal’s List (3.5+ GPA for three consecutive academic terms) - Fall 2019 ● Relevant Coursework: Financial Engineering in Risk Management, Computer Applications in Finance, Advanced Corporate Finance Chartered Financial Analyst Level Two Candidate Exam Date: Nov 2021 ● Pursuing CFA designation by acquiring a wide breadth of portfolio management skills. ● Level One Exam passed on June 2019. WORK EXPERIENCE: Middle Market Portfolio Analyst – Comerica Bank, San Jose, CA Jul 2019 to Present ● Analyze employer’s Middle Market business for the California region through industry, financial, macroeconomic data and other supporting credit information concerning an applicant's credit requests. ● Identify key business and financial risks that may impact the repayment prospects by the borrower. ● Expertise in Salesforce Data Management and CRM software systems utilized while underwriting to 8-figure commercial banking facilities ranging from $5M to $100M loans and facilities. aggregate exposure of bank assets. ● Prepare, review and assess the creditworthiness of commercial loan originations and renewals by evaluating tax returns, spreads of financial statements, historical trends, rent rolls, leases, projections, management performance, industry reports, cash flow models, capital structure and collateral analysis and other relevant data to analyze portfolio companies’ repayment capacity. ● Ensure the integrity of performance data for clientele and prospects and maintain ongoing relationships with thecustodial partners banks, vendors, and internal groups. ● Offer insights into customer financial needs, including opportunities identified using Line of Business-approved relationship expansion tools. Contribute personal insights related to a loan structure's effectiveness to mitigate risks, appropriate to prevailing competitive market environment and Bank risk tolerances. ● Prepared & presented nation-wide internal quarterly Company Q1 & Q2 2020 earnings reports and portfolio updates (within a team of four). ● Undertook special project initiative while fluidly adapting self-starter work ethic to a remote work environment during the initial rollout of Federal Treasury Payroll Protection Program and reviewed numerous applicants’ eligibility & fund usage during COVID-19 global pandemic. Venture Capital Analyst Intern - LDR Ventures, San Luis Obispo, CA Jan 2019 to Apr 2019 ● Analyzed investment opportunities up to $1.5M, prepared fundraising pitches to external stakeholders and prospective investors, and identified potential risks for early stage portfolio companies ● Assisted in building pricing models to help companies launch multiple new product lines and conduct stress tests under varying scenario analyses. ● Oversaw a personally proposed initiative for a portfolio company’s marketing campaign across universities in California. LEADERSHIP & OTHER RELEVANT EXPERIENCE Banking Valuations, Investment Banking Society San Luis Obispo, CA Jan 2019 to Feb 2019 ● Took an extracurricular course which covered the three main methods of company valuations ● Competed in a Goldman Sachs case competition against over 20 teams to create a pitch deck and presentation for a real case study. Recommended a company’s IPO by analyzing their financial position, creating a pro forma financial model, computing value with several valuation methodologies and examining IPO market conditions Member - MacIntalkers of Toastmasters International Apple Cupertino, CA Jul 2018 to Feb 2020 ● Delivered five public speeches under the “Dynamic Leadership” Pathways project. ● Developed effective communication skills on a weekly basis. " 
lyft_job_desc = "Financial Analyst, Strategy Finance at Lyft San Francisco, CA At Lyft, our mission is to improve people’s lives with the world’s best transportation. To do this, we start with our own community by creating an open, inclusive, and diverse organization.  Lyft is hiring a Financial Analyst for its Strategy Finance Team. The candidate in this position will provide financial and analytical support to drive strategic decisions for the company and help prepare financial management reporting. As a Financial Analyst, you will work directly with stakeholders across Finance in forecasting, planning and reporting key metrics to senior leadership.  Responsibilities: Help in analyzing & modeling forecast trends for total company financials Assist in the preparation and analysis of consolidated P&L for actuals and forecasts, help the FP&A team on deliverables, ongoing variance analysis, and ad hoc modeling Help lead the FP&A team through weekly and monthly forecasting Assist in the quarterly and annual strategic planning process Collaborate with Investor Relations by analyzing relevant financial information in preparation for the earnings call and investor presentations Team up with Corporate Development to create Board of Directors financials Partner with FP&A, Accounting, Treasury, Tax, and HR to forecast centralized expenses Drive monthly and quarterly close activities for FP&A and support consolidated management reporting Partner with Accounting to manage close timelines, process and reporting Manage creation of internal executive reporting documents including board, close and other management presentations and workbooks Support initiatives to create process efficiencies & improvements within FP&A Experience: BA/BS with 3+ years of experience in financial planning and analytics (FP&A) in a rigorous environment Corporate Finance, forecasting, or consolidations experience is a plus Detail-oriented and organized self-starter with a drive to dig into complex problems Advanced Excel skills. Experience building complex formulas and manipulating large data sets Ability to work in a fast-paced, team-based environment with minimal supervision Research, quantitative and analytical skills Comfortable navigating through financial statements Ability to organize and track overlapping tasks and assignments, with frequent priority changes Strong interpersonal and communication skills, with the ability to communicate and influence effectively across various departments Benefits: Great medical, dental, and vision insurance options Mental health benefits In addition to 12 observed holidays, salaried team members have unlimited paid time off, hourly team members have 15 days paid time off 401(k) plan to help save for your future 18 weeks of paid parental leave. Biological, adoptive, and foster parents are all eligible Pre-tax commuter benefits Lyft Pink - Lyft team members get an exclusive opportunity to test new benefits of our Ridership Program " 

In [64]:
resume_count_matrix = cv.fit_transform([resume,lyft_job_desc])

In [65]:
from sklearn.metrics.pairwise import cosine_similarity

In [66]:
print(cosine_similarity(resume_count_matrix))

[[1.         0.65583356]
 [0.65583356 1.        ]]


In [67]:
matchPercentage =  round(cosine_similarity(resume_count_matrix)[0][1]*100,2)

In [68]:
matchPercentage

65.58

In [None]:
6 job titles [  DS_words   |  FA_words |  CE_words ]
input text   [ %fit        |           |           ]


In [None]:
FA_profile = "Finance" --> "financ"
IF name LIKE financ% 
