In [73]:
import numpy as np
import pandas as pd
from pprint import pprint
from time import time

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
#from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

import csv

In [50]:
# Load the labelled event dataset used to train both classifiers.
# The displayed frame has the columns 'EVENT TITLE', 'EVENT DOMAIN' and
# 'EVENT TYPE', plus an 'Unnamed: 0' column.
# NOTE(review): 'Unnamed: 0' is presumably a row index saved with the CSV -
# confirm, and consider reading it with index_col=0.
data = pd.read_csv('final_data.csv')
# Show the frame as the cell output for a quick visual check.
data

Unnamed: 0,EVENT TITLE,EVENT DOMAIN,EVENT TYPE
0,Times Higher Education Regional Academic Semin...,Higher Education,Seminar
1,Leadership Seminar by XYZ group,,Seminar
2,"Seminar on Software Applications, Applied Scie...",Other,Seminar
3,10th Annual National Expo on Artificial Intell...,Artificial Intelligence,Expo
4,Webinar on higher education,Higher Education,Webinar
...,...,...,...
209,Internship opportunity in data science,Data Science,Internship
210,Internship opportunity in security,Security,Internship
211,Internship opportunity in networking,Networking,Internship
212,Internship opportunity in Cloud computing,Cloud Computing,Internship


In [51]:
# Class balance of the domain labels. The counts include case-variant
# duplicates ('Data Science' / 'Data science', 'Cloud Computing' /
# 'Cloud computing') and several labels with a single example.
# NOTE(review): these look like labelling inconsistencies - consider
# normalising the labels before training.
data['EVENT DOMAIN'].value_counts()

Management                 18
None                       18
Other                      16
Security                   15
Cloud Computing            13
Higher Education           12
Blockchain                 11
Networking                 11
Mobile Application         10
IoT                        10
Machine Learning            9
Coding                      9
Software Architecture       9
Artificial Intelligence     8
C++                         7
Data Science                6
Web Development             6
Java                        4
Development Processes       4
Hardware                    4
Python                      4
C                           3
Finance                     3
Data science                1
Cloud computing             1
JavaScript                  1
Development                 1
Name: EVENT DOMAIN, dtype: int64

In [52]:
def pre_process_text(textArray):
    """Lower-case, stop-word-filter and lemmatize every text in ``textArray``.

    Each entry is coerced with ``str()`` (so non-string entries are processed
    as their string representation), lower-cased and split on whitespace.
    English stop words are dropped and the remaining tokens are lemmatized
    with WordNet, then re-joined with single spaces.

    Parameters
    ----------
    textArray : iterable
        The texts to clean (for example a pandas Series or a NumPy array).

    Returns
    -------
    list of str
        One cleaned string per input entry, in the same order.
    """
    wnl = WordNetLemmatizer()
    # Load the stop-word list once instead of once per token, and keep it in
    # a set so each membership test is a hash lookup.
    english_stopwords = set(stopwords.words('english'))
    processed_text = []
    for text in textArray:
        words_list = str(text).lower().split()
        final_words = [wnl.lemmatize(word) for word in words_list
                       if word not in english_stopwords]
        processed_text.append(" ".join(final_words))
    return processed_text

# Replace the raw titles with their cleaned form (lower-cased, stop words
# removed, lemmatized). This overwrites the column in place, so the raw
# titles are no longer available in `data` after this cell runs.
data['EVENT TITLE'] = pre_process_text(data['EVENT TITLE'])
# Uncomment the next line to inspect the cleaned titles.
#data['EVENT TITLE']


In [19]:
# One-time download of the NLTK stop-word corpus that pre_process_text reads
# via stopwords.words('english').
# NOTE(review): this cell sits after the cell that calls pre_process_text;
# on a fresh kernel it has to run first.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aakan\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [21]:
# One-time download of the WordNet corpus backing the WordNetLemmatizer used
# in pre_process_text.
# NOTE(review): this cell sits after the cell that calls pre_process_text;
# on a fresh kernel it has to run first.
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\aakan\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


True

In [53]:
# Text-classification pipeline shared by both grid searches:
# token counts -> (optional) TF-IDF re-weighting -> random forest.
# The forest is seeded so that the cross-validation scores, the selected
# hyper-parameters and the persisted models are reproducible across runs.
RANDOM_STATE = 42

pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', RandomForestClassifier(random_state=RANDOM_STATE)),
])


In [60]:
# Hyper-parameter grid explored by GridSearchCV. Keys follow the
# "<pipeline step>__<parameter>" convention of the pipeline steps
# ('vect', 'tfidf'); no classifier parameters are tuned.

# CountVectorizer options.
vectorizer_grid = {
    'vect__max_df': (0.5, 1.0),
    'vect__max_features': (None, 1000, 5000),
    'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams only, or unigrams + bigrams
}

# TfidfTransformer options.
tfidf_grid = {
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2'),
}

# Merge in the same order as before: vectorizer keys first, then tf-idf keys.
parameters = {**vectorizer_grid, **tfidf_grid}

In [61]:
# Reference lists of event types and event domains.
# NOTE(review): neither list is referenced by the other cells visible in this
# notebook, and several spellings differ from the labels shown in the dataset
# output (e.g. 'Internships' vs 'Internship', 'Mobile Applications' vs
# 'Mobile Application', 'Development Processess' vs 'Development Processes',
# 'Javascript' vs 'JavaScript'). Confirm which spelling consumers expect
# before relying on these.
types = [
    'Internships',
    'Courses',
    'Fests',
    'Competitions',
    'Jobs',
    'Seminars',
    'Expos',
    'Certifications',
    'Hackathons',
    'Talks',
    'Webinars',
    'Trainings',
    'Workshops',
]

domains = [
    'Security',
    'Other',
    'Cloud Computing',
    'Management',
    'Coding',
    'Python',
    'Mobile Applications',
    'IoT',
    'Java',
    'Finance',
    'Networking',
    'Blockchain',
    'C++',
    'Development Processess',
    'C',
    'Machine Learning',
    'Web Development',
    'Higher Education',
    'Software Architecture',
    'Javascript',
    'Data Science',
    'Hardware',
    'Artificial Intelligence',
]


In [62]:
#grid search for event type
grid_search_type = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, refit=True)

print("Grid Search started\n---------------------------------------")
print("Pipeline:", [name for name, _ in pipeline.steps])
print("Grid Search Parameters:")
pprint(parameters)
t0 = time()
grid_search_type.fit(np.array(data['EVENT TITLE']), np.array(data['EVENT TYPE']))
print("done in %0.3fs\n----------------------------------------------" % (time() - t0))

print("Best Score: %0.3f\n-------------------------------------------" % grid_search_type.best_score_)
print("Best Parameters:")
best_parameters = grid_search_type.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))
    




Grid Search started
---------------------------------------
Pipeline: ['vect', 'tfidf', 'clf']
Grid Search Parameters:
{'tfidf__norm': ('l1', 'l2'),
 'tfidf__use_idf': (True, False),
 'vect__max_df': (0.5, 1.0),
 'vect__max_features': (None, 1000, 5000),
 'vect__ngram_range': ((1, 1), (1, 2))}
Fitting 3 folds for each of 48 candidates, totalling 144 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    3.5s


done in 4.490s
----------------------------------------------
Best Score: 0.855
-------------------------------------------
Best Parameters:
	tfidf__norm: 'l1'
	tfidf__use_idf: False
	vect__max_df: 0.5
	vect__max_features: 5000
	vect__ngram_range: (1, 1)


[Parallel(n_jobs=-1)]: Done 129 out of 144 | elapsed:    4.3s remaining:    0.4s
[Parallel(n_jobs=-1)]: Done 144 out of 144 | elapsed:    4.3s finished


In [63]:
#grid search for event domain
grid_search_domain = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, refit=True)

print("Grid Search started\n---------------------------------------")
print("Pipeline:", [name for name, _ in pipeline.steps])
print("Grid Search Parameters:")
print(parameters)
t0 = time()
grid_search_domain.fit(np.array(data['EVENT TITLE']), np.array(data['EVENT DOMAIN']))
print("done in %0.3fs\n----------------------------------------------" % (time() - t0))

print("Best Score: %0.3f\n-------------------------------------------" % grid_search_domain.best_score_)
print("Best Parameters:")
best_parameters = grid_search_domain.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Grid Search started
---------------------------------------
Pipeline: ['vect', 'tfidf', 'clf']
Grid Search Parameters:
{'vect__max_df': (0.5, 1.0), 'vect__max_features': (None, 1000, 5000), 'vect__ngram_range': ((1, 1), (1, 2)), 'tfidf__use_idf': (True, False), 'tfidf__norm': ('l1', 'l2')}
Fitting 3 folds for each of 48 candidates, totalling 144 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


done in 1.218s
----------------------------------------------
Best Score: 0.813
-------------------------------------------
Best Parameters:
	tfidf__norm: 'l2'
	tfidf__use_idf: False
	vect__max_df: 0.5
	vect__max_features: None
	vect__ngram_range: (1, 1)


[Parallel(n_jobs=-1)]: Done 144 out of 144 | elapsed:    1.1s finished


In [68]:
# Persist the best pipeline found by each grid search.
# `sklearn.externals.joblib` was deprecated and later removed from
# scikit-learn; joblib is imported directly, falling back to the vendored
# copy only on very old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

joblib.dump(grid_search_domain.best_estimator_, "model_domain.pkl")
joblib.dump(grid_search_type.best_estimator_, "model_type.pkl")



['model_type.pkl']

In [70]:
# Example e-mail subjects to classify. `test_set` keeps the raw text because
# it is what gets reported next to the matched employees later on.
test_set = np.array([
    'join the workshop Machine learning',
    'compete coding',
    'get coaching now',
    'complete the course',
    'looking for an internship',
    'RE: join the fest now',
    'join blockchain certification course now'
])

# The models were trained on titles cleaned by pre_process_text, so the same
# cleaning (lower-casing, stop-word removal, lemmatization) is applied here;
# otherwise inflected words in the subjects would not match the lemmatized
# training vocabulary.
test_set_processed = np.array(pre_process_text(test_set))

predicted_type = grid_search_type.best_estimator_.predict(test_set_processed)
predicted_domain = grid_search_domain.best_estimator_.predict(test_set_processed)

print(predicted_type)
print(predicted_domain)

['Workshop' 'Webinar' 'Talk' 'Courses' 'Internship' 'Fest' 'Courses']
['Machine Learning' 'Coding' 'None' 'C++' 'None' 'None' 'Blockchain']


In [74]:
# Match each classified subject against employee interests and export the
# result as a CSV file.
emp = pd.read_csv('CCMLEmployeeData.csv')

empName = np.array(emp['Name'])
empDomain = np.array(emp['Domain'])
empEvent1 = np.array(emp['Event1'])
empEvent2 = np.array(emp['Event2'])
# One [name, domain, event1, event2] record per employee.
empData = [list(record) for record in zip(empName, empDomain, empEvent1, empEvent2)]

subject_count = test_set.shape[0]
# Per subject: employees whose record contains both the predicted type and
# the predicted domain, and employees whose record contains the type only.
TypeDomainMatchOutput = [[] for _ in range(subject_count)]
OnlyTypeMatchOutput = [[] for _ in range(subject_count)]

for i in range(subject_count):
    for record in empData:
        if predicted_type[i] not in record:
            continue
        if predicted_domain[i] in record:
            TypeDomainMatchOutput[i].append(record[0])
        else:
            OnlyTypeMatchOutput[i].append(record[0])

# Comma-separated names of the type-only matches for each subject. Joining
# the names avoids the leading comma that piecewise concatenation produced.
# NOTE(review): employees matching both type and domain are collected in
# TypeDomainMatchOutput but are not exported - confirm this is intended.
MatchedEmployees = [",".join(names) for names in OnlyTypeMatchOutput]

myData = [["Subject", "Employee Names"]]

for subject, names, joined_names in zip(test_set, OnlyTypeMatchOutput, MatchedEmployees):
    print(subject, ":", names)
    myData.append([subject, joined_names])

# newline='' is required by the csv module; without it an extra blank line is
# written after every row on Windows. The (misspelled) file name is kept so
# existing consumers of the output still find it.
with open('ClassifierOuput.csv', 'w', newline='') as myFile:
    writer = csv.writer(myFile)
    writer.writerows(myData)

print("Writing complete")


join the workshop Machine learning : []
compete coding : []
get coaching now : []
complete the course : ['Curtis Ortega', 'David Foster', 'Christine Potter', 'Julian Sanders', 'Melissa Brown', 'Javier Leblanc', 'Brittney Copeland', 'Michelle Miller', 'Daniel Orozco', 'Michele Campbell', 'Michael Alvarado', 'Peter Wood', 'Dennis Ramirez', 'Mrs. Alexa Henson MD', 'Tracy Mejia', 'Edwin Bowman', 'Michael West', 'Jennifer Merritt', 'Katherine Gonzalez', 'Robert Ramirez', 'Julia Park', 'Jason Anthony', 'Katelyn Barnes']
looking for an internship : []
RE: join the fest now : []
join blockchain certification course now : ['Curtis Ortega', 'David Foster', 'Christine Potter', 'Julian Sanders', 'Melissa Brown', 'Javier Leblanc', 'Brittney Copeland', 'Michelle Miller', 'Daniel Orozco', 'Michele Campbell', 'Michael Alvarado', 'Peter Wood', 'John Pearson', 'Dennis Ramirez', 'Tracy Mejia', 'Edwin Bowman', 'Michael West', 'Jennifer Merritt', 'Katherine Gonzalez', 'Robert Ramirez', 'Michael Rowe', 'Jas