#### Imports

In [1]:
from MyFunctions import prettify
from nltk import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans #this is currently an arbitrary choice.
#More thought should be put into what model to use.
import numpy as np

#### Build the corpus
Here I am building the feature array that will be used to classify career paths.

In [2]:
# Tunable settings for the notebook.
numberOfSamples = 1000   # how many career paths are read from the data file
numberOfFeatures = 10    # cap on the vocabulary size kept by the vectorizer

# Seed job categories; each entry is expanded by prettify() to build the corpus.
initialList = [
    'software_engineer',
    'information_technology_specialist',
    'data',
    'software_developer',
    'computer_scientist',
    'sales_manager',
    'human_resource',
    'business',
    'executive',
    'design',
]

In [3]:
# Build the corpus: every seed category is expanded by prettify() into one or
# more title strings, and all of them are collected into a single flat list.
corpus = []
for path in initialList:
    corpus.extend(prettify(path))
print(corpus[0:30])

# The vectorizer makes an array of features (words) that are used in the corpus.
# The job paths are then characterized using this array and classified.
vectorizer = CountVectorizer(min_df=0.01, max_df=0.75, max_features=numberOfFeatures)
# Only the learned vocabulary is needed here, so fit() is enough; the
# document-term matrix that fit_transform() would return was never used.
vectorizer.fit(corpus)

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); support both so the cell runs on old and new versions.
# The names are fetched once instead of on every loop iteration.
if hasattr(vectorizer, "get_feature_names_out"):
    featureNames = vectorizer.get_feature_names_out()
else:
    featureNames = vectorizer.get_feature_names()

print("\nFeatures:")
for i, featureName in enumerate(featureNames):
    print(str(i) + ":\t" + featureName)

['software engineer', 'information technology specialist', 'data', 'software developer', 'computer scientist', 'sales manager', 'human resource', 'business', 'executive', 'design']

Features:
0:	business
1:	computer
2:	data
3:	design
4:	developer
5:	engineer
6:	executive
7:	human
8:	information
9:	software


#### Read in the file and process
Here we read the file line by line into a list of career paths.

In [4]:
# Each career path becomes its own single-element list.
pathList = []  # The list of career paths

# NOTE(review): absolute, machine-specific path - consider making it configurable.
dataFilePath = "/Users/gregorycolledge/gcolledge/gcolledge/MSD_6019/allPaths.txt"

# The with-block guarantees the file is closed even if prettify() raises midway.
with open(dataFilePath) as DataFile:
    for _ in range(numberOfSamples):
        # readline() returns "" once the file is exhausted, so a file with fewer
        # than numberOfSamples lines still yields numberOfSamples entries.
        careerPath = DataFile.readline()
        careerPathList = prettify(careerPath)
        # Each title is preceded by a single space, so a non-empty path string
        # starts with a space.
        pathString = "".join(" " + title for title in careerPathList)
        pathList.append([pathString])


Here I am turning the career path strings into feature vectors (vectorized rows) in preparation for using the clustering model.

In [5]:
# Run every entry of pathList through the fitted vectorizer and keep the dense
# count array produced for it, one array per career path.
vectorizedRows = [vectorizer.transform(path).toarray() for path in pathList]

Here I am making the model. The list of vectorized rows has to be turned into an array and reshaped.

In [6]:
# Convert the list of per-path arrays into one ndarray a single time, instead of
# rebuilding it for every use.
vectorizedArray = np.array(vectorizedRows)
print(vectorizedArray.shape)  # (rows used for clustering, samples per row, features per sample)

# Collapse to two dimensions: one row per career path. The column count is
# inferred (-1) rather than hard-coded to numberOfFeatures, because max_features
# is only an upper bound on the vocabulary size and a smaller vocabulary would
# make a fixed-shape reshape raise.
featureMatrix = vectorizedArray.reshape(len(vectorizedRows), -1)
print(featureMatrix.shape)  # (number of samples, number of features)

# random_state is fixed so that the clustering is reproducible between runs.
kmeans = KMeans(n_clusters=3, random_state=4)
kmeans.fit(featureMatrix)

(1000, 1, 10)
(1000, 10)


KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=3, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=4, tol=0.0001, verbose=0)

In [7]:
# Round every coordinate of each fitted cluster centre so the centres can be
# read as approximate per-feature word counts.
clusterPoints = [
    [round(coordinate) for coordinate in center]
    for center in kmeans.cluster_centers_
]
for center in clusterPoints:
    print(center)

[0.0, 0.0, 0.0, 0.0, 0.0, 4.0, 0.0, -0.0, 0.0, 4.0]
[0.0, 0.0, 0.0, 1.0, 0.0, 4.0, 0.0, -0.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0]


In [8]:
# Hand-picked career paths used as a quick sanity check of the fitted model.
samplePaths = [
    'bum',
    'software engineer',
    'program manager',
    ' administrative professional maternity and family leave customer service associate patient services coordinator clinical admin team lead credentialing specialist managed care credentialing specialist credentialing verification office',
    ' software developer intern teaching assistant software engineer software engineer software engineer',
    'senior technical recruiter',
    'software engineer software engineer software engineer software engineer software engineer software engineer software engineer software engineer software engineer software engineer software engineer software engineer software engineer software engineer',
    'computer programmer',
    'technical program manager',
    'machine leraning specialist and data analyst',
]

# Vectorize the samples with the fitted vocabulary, then assign each one to its
# nearest cluster. toarray() already yields a dense ndarray for predict().
test = vectorizer.transform(samplePaths)
result = kmeans.predict(test.toarray())
print(result)

[2 2 2 2 0 2 0 2 2 2]
