In [167]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import label_binarize
from nltk.tokenize import TweetTokenizer, RegexpTokenizer
col = ['EVENT TITLE', 'EVENT DOMAIN']
data = pd.read_csv("event_data.csv");
training_data = pd.DataFrame(data,columns = col)
training_data.head()

Unnamed: 0,EVENT TITLE,EVENT DOMAIN
0,Times Higher Education Regional Academic Semin...,Higher Education
1,Leadership Seminar by XYZ group,
2,"Seminar on Software Applications, Applied Scie...",Other
3,10th Annual National Expo on Artificial Intell...,Artificial Intelligence
4,Webinar on higher education,Higher Education


In [168]:
from nltk.stem import PorterStemmer
porter = PorterStemmer()

def make_tokens(text):
    return text.split(' ')
        
def process_text(text):
    tokenized = make_tokens(text)
    lower = [item.lower() for item in tokenized]
    lemma = [porter.stem(x) for x in lower]
    return lemma

#preprocess data
for line in training_data['EVENT TITLE']:
    #print(line)
    training_data['CONTENT'] = training_data['EVENT TITLE'].apply(process_text)
training_data.head()

Unnamed: 0,EVENT TITLE,EVENT DOMAIN,CONTENT
0,Times Higher Education Regional Academic Semin...,Higher Education,"[time, higher, educ, region, academ, seminar, ..."
1,Leadership Seminar by XYZ group,,"[leadership, seminar, by, xyz, group]"
2,"Seminar on Software Applications, Applied Scie...",Other,"[seminar, on, softwar, applications,, appli, s..."
3,10th Annual National Expo on Artificial Intell...,Artificial Intelligence,"[10th, annual, nation, expo, on, artifici, int..."
4,Webinar on higher education,Higher Education,"[webinar, on, higher, educ]"


#Stemming words
from nltk.stem import PorterStemmer
porter = PorterStemmer()
print(porter.stem("application"))



In [169]:
#find word frequency in all articles
from collections import Counter
texts = training_data['CONTENT']

def word_count(texts):
    flat = [item for sublist in texts for item in sublist]
    with_counts = Counter(flat)
    top = with_counts.most_common()
    word = [each[0] for each in top]
    num = [each[1] for each in top]
    return pd.DataFrame([word, num]).T
    
#type(texts)
w_count = word_count(texts)


In [170]:
process_label = [item.lower() for item in training_data['EVENT DOMAIN']]

print(process_label)
#label numbers
LE = LabelEncoder()
training_data['label_num'] = LE.fit_transform(process_label)

#display(training_data.groupby(['EVENT DOMAIN'])['EVENT TITLE'].count())
#display(df_holdout.groupby(['label'])['content'].count())
display(training_data['EVENT DOMAIN'].unique())
training_data['label_num'].unique()

['higher education', 'none', 'other', 'artificial intelligence', 'higher education', 'management', 'artificial intelligence', 'none', 'iot', 'data science', 'coding', 'networking', 'mobile application', 'coding', 'higher education', 'c++', 'none', 'none', 'none', 'other', 'iot', 'cloud computing', 'artificial intelligence', 'web development', 'none', 'none', 'coding', 'blockchain', 'other', 'development', 'none', 'mobile application', 'python', 'none', 'other', 'management', 'networking', 'python', 'java', 'web development', 'iot', 'security', 'hardware', 'cloud computing', 'mobile application', 'security', 'higher education', 'networking', 'data science', 'iot', 'artificial intelligence', 'machine learning', 'higher education', 'blockchain', 'management', 'none', 'coding', 'other', 'other', 'security', 'coding', 'coding', 'coding', 'iot', 'higher education', 'machine learning', 'artificial intelligence', 'none', 'hardware', 'finance', 'web development', 'web development', 'mobile appl

array(['Higher Education', 'None', 'Other', 'Artificial Intelligence',
       'Management', 'IoT', 'Data science', 'Coding', 'networking',
       'mobile application', 'C++', 'Cloud computing', 'Web development',
       'none', 'Blockchain', 'Development', 'Python', 'other',
       'Networking', 'Java', 'Web Development', 'security', 'Hardware',
       'cloud computing', 'Data Science', 'Machine Learning', 'Security',
       'Finance', 'Mobile Applications', 'Cloud Computing',
       'Software Architecture', 'JavaScript', 'Development Processes',
       'C'], dtype=object)

array([11, 20, 21,  0, 16, 12,  6,  5, 19, 17,  3,  4, 25,  1,  7, 22, 13,
       23, 10, 15,  9, 18, 24, 14,  8,  2], dtype=int64)

In [171]:
#from gensim.models import Phrases

#split training data into categories
train_0 = training_data.loc[training_data['label_num'] == 0]
train_1 = training_data.loc[training_data['label_num'] == 1]
train_2 = training_data.loc[training_data['label_num'] == 2]
train_3 = training_data.loc[training_data['label_num'] == 3]
train_4 = training_data.loc[training_data['label_num'] == 4]
train_5 = training_data.loc[training_data['label_num'] == 5]
train_6 = training_data.loc[training_data['label_num'] == 6]
train_7 = training_data.loc[training_data['label_num'] == 7]
train_8 = training_data.loc[training_data['label_num'] == 8]
train_9 = training_data.loc[training_data['label_num'] == 9]
train_10 = training_data.loc[training_data['label_num'] == 10]
train_11 = training_data.loc[training_data['label_num'] == 11]
train_12 = training_data.loc[training_data['label_num'] == 12]

#hold 5 train examples for prediction later
train_0_hold = train_0.iloc[:5]
train_1_hold = train_1.iloc[:5]
train_2_hold = train_2.iloc[:5]
train_3_hold = train_3.iloc[:5]
train_4_hold = train_4.iloc[:5]
train_5_hold = train_5.iloc[:5]
train_6_hold = train_6.iloc[:5]
train_7_hold = train_7.iloc[:5]
train_8_hold = train_8.iloc[:5]
train_9_hold = train_9.iloc[:5]
train_10_hold = train_10.iloc[:5]
train_11_hold = train_11.iloc[:5]
train_12_hold = train_12.iloc[:5]

#---------------------------

train_0 = train_0.iloc[5:]
train_1 = train_1.iloc[5:]
train_2 = train_2.iloc[5:]
train_3 = train_3.iloc[5:]
train_4 = train_4.iloc[5:]
train_5 = train_5.iloc[5:]
train_6 = train_6.iloc[5:]
train_7 = train_7.iloc[5:]
train_8 = train_8.iloc[5:]
train_9 = train_9.iloc[5:]
train_10 = train_10.iloc[5:]
train_11 = train_11.iloc[5:]
train_12 = train_12.iloc[5:]



df = pd.concat([train_0,train_1,train_2])
print(df)

#---------------------------

#considering bigrams
text_0 = train_0['CONTENT'].tolist()
text_1 = train_1['CONTENT'].tolist()
text_2 = train_2['CONTENT'].tolist()
text_3 = train_3['CONTENT'].tolist()
text_4 = train_4['CONTENT'].tolist()
text_5 = train_5['CONTENT'].tolist()
text_6 = train_6['CONTENT'].tolist()
text_7 = train_7['CONTENT'].tolist()
text_8 = train_8['CONTENT'].tolist()
text_9 = train_9['CONTENT'].tolist()
text_10 = train_10['CONTENT'].tolist()
text_11 = train_11['CONTENT'].tolist()
text_12 = train_12['CONTENT'].tolist()

#text_0

                                           EVENT TITLE  \
89   Get insights into Advanced Artificial Intellig...   
131                  Artificial Intelligence hackathon   
156          A talk session on Artificial Intelligence   
148         A Job opening in blockchain in our company   
167                       A talk session on Blockchain   
171                   A hands-on session on blockchain   
172                   A hands-on session on blockchain   
190  This is to notify the employees about the bloc...   
194                    complete all blockchain courses   

                EVENT DOMAIN  \
89   Artificial Intelligence   
131  Artificial Intelligence   
156  Artificial Intelligence   
148               Blockchain   
167               Blockchain   
171               Blockchain   
172               Blockchain   
190               Blockchain   
194               Blockchain   

                                               CONTENT  label_num  
89   [get, insight, into, adva

from nltk import word_tokenize
from nltk.util import ngrams
for i in range(len(text_0)):
    token = nltk.word_tokenize(text_0[i])
    bigram = list(ngrams(token, 2))
    print(bigram)


In [140]:
import itertools
def ngrams_wrapper(sent):
    return list(nltk.ngrams(sent, 2))
def b_freq(text):
    bigrams = map(ngrams_wrapper, text)
    bigram = list(itertools.chain.from_iterable(bigrams))
    freq_dist = nltk.FreqDist(bigram)
    column=["count"]
    return pd.DataFrame(freq_dist,column).T
    #prob_dist = nltk.MLEProbDist(freq_dist)
    #number_of_bigrams = freq_dist.N()
    #number_of_bigrams
    


In [141]:
#bigram count
b0 = b_freq(text_0)
b1 = b_freq(text_1)
b2 = b_freq(text_2)
b3 = b_freq(text_3)
b4 = b_freq(text_4)
b5 = b_freq(text_5)
b6 = b_freq(text_6)
b7 = b_freq(text_7)
b8 = b_freq(text_8)
b9 = b_freq(text_9)
b10 = b_freq(text_10)
b11 = b_freq(text_11)
b12 = b_freq(text_12)
#b0['artifici','intellig']
#b_word_freq = pd.concat([b0,b1,b2,b3,b4,b5,b6,b7,b8,b9,b10,b11,b12], axis = 1)
#b_word_freq = pd.concat([b0,b1], axis = 1)
#column = ['0','1']
#b_word_freq.columns = column
#b_word_freq
b6

Unnamed: 0,Unnamed: 1,count
announc,skill,1
skill,develop,1
develop,and,1
and,comput,1
comput,scienc,1
scienc,cours,1


In [142]:
#make features
text = training_data['CONTENT'].astype('str')

tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), 
                                   min_df = 2, 
                                   max_df = .95)

X = tfidf_vectorizer.fit_transform(text) #features
y = training_data['label_num'].values #target

print (X.shape)
print(y.shape)

(54, 122)
(54,)


In [145]:
#Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score,recall_score,f1_score

#Train Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, shuffle = True, random_state = 3)

#Fit model
model = RandomForestClassifier(random_state=3).fit(X_train,y_train)

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test,y_pred)
accuracy



0.36363636363636365

In [147]:
#Grid Search
bootstrap = [True, False]
max_depth = [10, 50, 100, None]
max_features = ['auto', 'sqrt']
min_samples_leaf = [1, 2, 4]
min_samples_split = [2, 5, 10]
n_estimators = [800, 1400, 2000]
random_state = [3]

clf = RandomForestClassifier()

params = dict(bootstrap = bootstrap,
              max_depth = max_depth,
              max_features = max_features,
              min_samples_leaf = min_samples_leaf,
              n_estimators = n_estimators,
              random_state=random_state)

gridsearch = GridSearchCV(clf,
                          params, 
                          cv=5,
                          verbose=1, 
                          n_jobs=-1)

rf_best_model = gridsearch.fit(X, y)
pred = rf_best_model.predict(X_test)
accuracy = accuracy_score(y_test,pred)
accuracy
#rf_best_model = RandomForestClassifier(bootstrap = False,
#                                       max_depth = 50,
#                                       max_features = 'auto',
#                                       min_samples_leaf = 1,
#                                       n_estimators = 1400,
#                                       random_state=3)

Fitting 5 folds for each of 144 candidates, totalling 720 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   20.3s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 720 out of 720 | elapsed:  5.4min finished


1.0

In [163]:
#TESTING 

d = pd.read_csv("test.csv");
test = pd.DataFrame(d).T
#test
X_unseen_tfidf = tfidf_vectorizer.transform(test)
#p = tfidf_vectorizer.fit_transform(test)
#p = rf_best_model.predict(z)
#p

ValueError: Found array with 0 sample(s) (shape=(0, 122)) while a minimum of 1 is required.