In [1]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, cross_validate

from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer



import re

import matplotlib.pyplot as plt
import seaborn as sns
plt.rcParams['figure.figsize'] = (8, 8)
plt.rcParams['font.size'] = 17



import warnings
warnings.filterwarnings("ignore")
sns.set(style="ticks", color_codes=True)
%matplotlib inline

In [2]:
# Load the pre-processed job postings (path is relative to this notebook's folder).
# A plain string is used: the original f-string had no placeholders.
data = pd.read_csv('../app/data/processed_data.csv')


In [3]:
data.text[0]

"job descriptiondata scientist, marketingsan diego, ca /analytics – data science /full-timeheadquartered in san diego, we serve as a leading provider of working capital ($5k - $1.5m) to the small and medium-sized businesses that fuel our country. since 2008, we have prided ourselves on our collaborative, innovative, and customer-focused approach. enjoying a period of unprecedented growth, driven by the combination of cutting-edge technology, human touch, and unwavering integrity, we are looking to add to our people-first culture, with highly motivated and results-oriented professionals, to push the limits of what's possible while creating value for all of our partners.we are seeking a mid-level to senior level statistician, quantitative modeling specialist, or data scientist to join our analytics team and build predictive models for marketing. if you have exceptional analytical, quantitative and problem-solving skills, demonstrated experience designing and implementing predictive model

In [4]:
def cleanPunc(sentence):
    """Strip punctuation and special characters from ``sentence``.

    The characters ``? | ! ' " #`` are deleted outright; the characters
    ``. , ) ( /`` are each replaced by a single space. Surrounding
    whitespace is then trimmed and any remaining newline becomes a space.
    Backslashes are left untouched (``\\|`` in the pattern is an escaped pipe).
    """
    without_marks = re.sub(r'[?|!|\'|"|#]', r'', sentence)
    spaced_out = re.sub(r'[.|,|)|(|\|/]', r' ', without_marks)
    return spaced_out.strip().replace("\n", " ")


In [5]:
def clean_text(data):
    """Remove punctuation and digits from ``data``, one '.'-separated fragment at a time.

    Each fragment first loses the characters ``? | ! ' " # / , ( ) $ :``,
    is trimmed of surrounding spaces, and then additionally loses digits.
    Fragments whose stripped length is 1 or less are discarded; the rest are
    joined with single leading spaces and the result is stripped.

    Notes:
        * Hyphens are NOT removed: inside the character class ``|-|`` is
          parsed as a one-character range, not as a literal ``-``.
        * This definition is superseded by a later cell that redefines
          ``clean_text``.
    """
    kept_fragments = []
    for fragment in data.split('.'):
        # Same pattern the original built via implicit string concatenation.
        fragment = re.sub(r'[?|!|\'|"|#|/|-|,(|)|$|-||:]', r'', fragment).strip(' ')
        fragment = re.sub(r'[?|!|\'|"|#|/|-|,(|)|$|(0-9)]', r'', fragment)
        if len(fragment.strip()) > 1:
            kept_fragments.append(fragment)
    return ''.join(' ' + fragment for fragment in kept_fragments).strip()
    

data['cleaned'] = data.text.apply(clean_text)

In [6]:
data.cleaned[0]

'job descriptiondata scientist marketingsan diego ca analytics – data science full-timeheadquartered in san diego we serve as a leading provider of working capital k -  m to the small and medium-sized businesses that fuel our country since  we have prided ourselves on our collaborative innovative and customer-focused approach enjoying a period of unprecedented growth driven by the combination of cutting-edge technology human touch and unwavering integrity we are looking to add to our people-first culture with highly motivated and results-oriented professionals to push the limits of whats possible while creating value for all of our partners we are seeking a mid-level to senior level statistician quantitative modeling specialist or data scientist to join our analytics team and build predictive models for marketing if you have exceptional analytical quantitative and problem-solving skills demonstrated experience designing and implementing predictive models and analytics in marketing a pr

In [7]:
def clean_text(data):
    """Normalise raw posting text into single-space-separated alphabetic tokens.

    The text is split on '.', and for every fragment:

    1. the characters ``? | ! ' " # / , ( ) $ :`` are deleted (not spaced
       out), so ``what's`` becomes ``whats``;
    2. the remaining runs of ASCII letters and ``&`` are extracted, which
       drops digits and turns any other separator (e.g. ``-``) into a
       token boundary.

    Fragments that contain no tokens are skipped so the result never
    contains runs of consecutive spaces.

    Args:
        data: Raw text of one document.

    Returns:
        The cleaned text as a single string ('' when nothing survives).
    """
    cleaned_fragments = []
    for fragment in data.split('.'):
        # One pass is enough: deleting a fixed character set is idempotent.
        fragment = re.sub(r'[?|!\'"#/,()$:]', '', fragment)
        tokens = re.findall(r'[a-zA-Z&]+', fragment)
        if tokens:
            cleaned_fragments.append(' '.join(tokens))
    return ' '.join(cleaned_fragments)

    

data['cleaned'] = data.text.apply(clean_text)



In [8]:
data.cleaned[3]

'li remote about eab at eab our mission is to make education smarter and our communities stronger we work with more than institutions to drive transformative change through data driven insights and best in class capabilities from kindergarten to college to career eab partners with leaders and practitioners to accelerate progress and drive results across five major areas enrollment student success institutional strategy data & analytics and diversity equity and inclusion de&i we work with each partner differently tailoring our portfolio of research technology and marketing and enrollment solutions to meet the unique needs of every leadership team as well as the students and employees they serve at eab we serve not only our partner institutions but each other thats why we are always working to make sure our employees love their jobs and are invested in their communities see how weve been recognized for this dedication to our employees by checking out our recent awards for more informatio

### 4.1 Target
Before I can begin splitting the data, I need to set the target for my methodology of training four separate logistic regression models. I'm doing this because I'd like my classifications to be as accurate as possible, and because building my NLP strategy around a particular label lets me find the words that are common for that label, rather than words generalized across the entire corpus.

I'm going to one-hot-encode the target feature so I can select each of the new columns as my y - one for each model.

In [9]:
data = data[['company','rating','job_title','state','city','cleaned','target']]

In [10]:
# One-hot encode the `target` column into a dense integer indicator array
# (sparse=False -> ndarray, dtype='int' -> 0/1 integers), one column per
# distinct target value found by the encoder.
# NOTE(review): scikit-learn 1.2 renamed `sparse=` to `sparse_output=` and
# later removed the old name; update this argument when upgrading.
ohe = OneHotEncoder(sparse=False, dtype='int')
targets = ohe.fit_transform(pd.DataFrame(data.target))


In [11]:
targets = pd.DataFrame(targets,columns=['Q1','Q2','Q3','Q4','unk'])


In [12]:
data = data.join(targets)
data.head()

Unnamed: 0,company,rating,job_title,state,city,cleaned,target,Q1,Q2,Q3,Q4,unk
0,online technical services,3.7,data scientist - marketing,remote,remote,job descriptiondata scientist marketingsan die...,4.0,0,0,0,1,0
1,west cap,3.5,"data scientist, botguard",ny,remote in new york,human was founded in in a brooklyn sci fi book...,2.0,0,1,0,0,0
2,techtrueup,3.8,mcs data scientist,remote,remote,description data scientist fully remote develo...,3.0,0,0,1,0,0
3,eab,3.7,associate data scientist,dc,remote in washington,li remote about eab at eab our mission is to m...,1.0,1,0,0,0,0
4,redfin,3.4,senior data analyst - tour support (remote eli...,remote,remote,this position is a remote eligible position yo...,2.0,0,1,0,0,0


In [13]:
data.drop(['target','unk'], axis=1,inplace=True)

In [14]:
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")
def stemming(sentence):
    """Return ``sentence`` with each whitespace-separated word replaced by its stem.

    Words are stemmed with the module-level Snowball ``stemmer`` and re-joined
    with single spaces; the result carries no leading or trailing whitespace.
    """
    stems = (stemmer.stem(word) for word in sentence.split())
    return " ".join(stems).strip()


data['comment_text'] = data['cleaned'].apply(stemming)

In [15]:
data.comment_text[0]

'job descriptiondata scientist marketingsan diego ca analyt data scienc full timeheadquart in san diego we serv as a lead provid of work capit k m to the small and medium size busi that fuel our countri sinc we have pride ourselv on our collabor innov and custom focus approach enjoy a period of unpreced growth driven by the combin of cut edg technolog human touch and unwav integr we are look to add to our peopl first cultur with high motiv and result orient profession to push the limit of what possibl while creat valu for all of our partner we are seek a mid level to senior level statistician quantit model specialist or data scientist to join our analyt team and build predict model for market if you have except analyt quantit and problem solv skill demonstr experi design and implement predict model and analyt in market a proven track record of bring thought leadership to problem and the desir to make a rapid impact on the success of the busi this is an opportun for you the ideal candid

In [16]:
from nltk.stem import WordNetLemmatizer



lemmatizer = WordNetLemmatizer()
def stemming(sentence):
    """Return ``sentence`` with each whitespace-separated word lemmatized.

    Despite its name, this version does not stem: it passes every word through
    the module-level WordNet ``lemmatizer`` and re-joins the results with single
    spaces. It redefines (and therefore replaces) the Snowball-based
    ``stemming`` declared in an earlier cell.
    """
    lemmas = [lemmatizer.lemmatize(token) for token in sentence.split()]
    return " ".join(lemmas).strip()


data['comment_text_lem'] = data['cleaned'].apply(stemming)
data.comment_text_lem[0]

'job descriptiondata scientist marketingsan diego ca analytics data science full timeheadquartered in san diego we serve a a leading provider of working capital k m to the small and medium sized business that fuel our country since we have prided ourselves on our collaborative innovative and customer focused approach enjoying a period of unprecedented growth driven by the combination of cutting edge technology human touch and unwavering integrity we are looking to add to our people first culture with highly motivated and result oriented professional to push the limit of whats possible while creating value for all of our partner we are seeking a mid level to senior level statistician quantitative modeling specialist or data scientist to join our analytics team and build predictive model for marketing if you have exceptional analytical quantitative and problem solving skill demonstrated experience designing and implementing predictive model and analytics in marketing a proven track recor

In [17]:
data.drop(['cleaned','comment_text'], axis=1,inplace=True)

In [18]:
# One frame per quartile label: each keeps every other column of `data` plus
# its own Qn indicator, and drops the remaining three indicators.
q1_data = data.drop(columns=['Q2', 'Q3', 'Q4'])
q2_data = data.drop(columns=['Q1', 'Q3', 'Q4'])
q3_data = data.drop(columns=['Q1', 'Q2', 'Q4'])
q4_data = data.drop(columns=['Q1', 'Q2', 'Q3'])

In [19]:
# Features are every column except the Q1 indicator; Q1 itself is the target.
X = q1_data.drop(columns=['Q1'])
y = q1_data['Q1']

# 80/20 hold-out split with a fixed seed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [20]:
x_train

Unnamed: 0,company,rating,job_title,state,city,comment_text_lem
995,spotify,4.3,"data scientist, advertising economics",ny,remote in new york,data research & insightsdata scienceat spotify...
507,s&p global,3.9,data scientist,tn,remote in nashville,segment market intelligence the role data scie...
334,cybercoders,3.7,remote senior data analyst,ca,remote in san francisco,remote senior data analyst if you are a senior...
848,galaxe.solutions,2.5,data analyst,wi,remote in milwaukee,what you will dowell acquainted with the dba r...
294,galaxe.solutions,2.5,data analyst,wi,remote in milwaukee,what you will dowell acquainted with the dba r...
...,...,...,...,...,...,...
87,online technical services,3.7,data scientist - marketing,remote,remote,job descriptiondata scientist marketingsan die...
330,cybercoders,3.7,principal data scientist,wa,remote in seattle,principal data scientist if you are a principa...
466,sparkcognition,4.4,machine learning engineer,tx,remote in austin,voted best place to work in austin best paying...
121,amadeus,3.9,principal data scientist - network planning fo...,remote,remote,summary of the rolewhere to fly when to fly wh...


In [21]:
x_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 824 entries, 995 to 860
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   company           824 non-null    object 
 1   rating            824 non-null    float64
 2   job_title         824 non-null    object 
 3   state             824 non-null    object 
 4   city              824 non-null    object 
 5   comment_text_lem  824 non-null    object 
dtypes: float64(1), object(5)
memory usage: 45.1+ KB


In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer


# Fit TF-IDF exactly once, on the training text only, so the vocabulary and
# IDF weights are learned from x_train rather than re-learned for every row.
vectorizer = TfidfVectorizer()
corpus = x_train.comment_text_lem
vectorizer.fit(corpus)


def vect(sentence):
    """Return the TF-IDF representation of a single document.

    Args:
        sentence: One cleaned/lemmatized document string.

    Returns:
        A 1 x n_features sparse matrix produced by the already-fitted
        module-level ``vectorizer``. (Previously the argument was ignored and
        the whole corpus was re-fitted on every call, so every row received
        the same corpus-wide matrix.)
    """
    return vectorizer.transform([sentence])


vectorized = pd.DataFrame(data['comment_text_lem'].apply(vect))




In [None]:
vectorizer = TfidfVectorizer()
corpus = x_train.comment_text_lem

# Fit once up front: the fitted vocabulary does not depend on the row being
# processed, so refitting inside vect() only repeated identical work per row.
vectorizer.fit(corpus)
corpus_feature_names = vectorizer.get_feature_names_out()


def vect(sentence):
    """Return the vocabulary (feature names) learned from the training corpus.

    NOTE(review): ``sentence`` is not used, so every row of ``vectorized2``
    holds the same vocabulary array. Kept as in the original cell; confirm
    whether per-document terms were intended instead.
    """
    return corpus_feature_names


vectorized2 = pd.DataFrame(data['comment_text_lem'].apply(vect))

In [None]:
vectorized

In [None]:
vectorized2

In [25]:
# NOTE(review): the x_train.info() output above lists no `text` column, and this
# cell's execution count (In [25]) is lower than that of the cell before it
# (In [26]), i.e. it was run out of order against earlier kernel state.
# It will most likely raise AttributeError on a fresh Restart & Run All.
x_train.text

995      (0, 3687)\t0.03732987068641745\n  (0, 2710)\...
507      (0, 3687)\t0.03732987068641745\n  (0, 2710)\...
334      (0, 3687)\t0.03732987068641745\n  (0, 2710)\...
848      (0, 3687)\t0.03732987068641745\n  (0, 2710)\...
294      (0, 3687)\t0.03732987068641745\n  (0, 2710)\...
                             ...                        
87       (0, 3687)\t0.03732987068641745\n  (0, 2710)\...
330      (0, 3687)\t0.03732987068641745\n  (0, 2710)\...
466      (0, 3687)\t0.03732987068641745\n  (0, 2710)\...
121      (0, 3687)\t0.03732987068641745\n  (0, 2710)\...
860      (0, 3687)\t0.03732987068641745\n  (0, 2710)\...
Name: text, Length: 824, dtype: object

In [None]:
X = vectorizer.fit_transform(corpus)
vectorizer.get_feature_names_out()

In [None]:
# Column groups consumed by the preprocessing steps below.
#scaler_features = num_cols
nlp_cols = ['comment_text_lem']
le_cols = ['city','state','job_title','company']
scal_cols = ['rating']

one_hot_encoder = Pipeline(steps=[
    ('ohe', OneHotEncoder(drop='if_binary'))
])

# NOTE: this rebinds the name `vect`, replacing the helper function defined
# in an earlier cell with a Pipeline object.
vect = Pipeline(steps=[
    ('vect', TfidfVectorizer())
])

ord_enc = OrdinalEncoder()

# NOTE(review): LabelEncoder is meant for 1-D target labels, not feature
# columns, so this pipeline is not suitable as a ColumnTransformer step.
# It is left defined (and disabled below) only to preserve the original names;
# `ord_enc` is the feature-side alternative.
label_encoder = Pipeline(steps=[
    ('label_enc', LabelEncoder())
])
scaler_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])

minmax_scalar_transformer = Pipeline(steps=[
    ('scaler', MinMaxScaler())])


preprocessor = ColumnTransformer(
    transformers=[
        #('ord_cat', ordinal_cat_encoder, ordinal_cat_features),
        #('ord',ord_enc,le_cols),
        #('ohe', one_hot_encoder, nom_cols),
        #('binarize', one_hot_encoder, bin_cols),
        # TfidfVectorizer expects a 1-D iterable of strings. A *scalar* column
        # name makes ColumnTransformer pass that column as 1-D; a list selector
        # would hand over a 2-D frame and the vectorizer would iterate over
        # column labels instead of documents.
        ('vect',vect,nlp_cols[0]),
        #('label_enc', label_encoder, le_cols),
        #('scaler', scaler_transformer,scal_cols)
        #('minmax_scaler', minmax_scalar_transformer,scaler_features)
    ],remainder='drop'
)

transformer = Pipeline(steps=[('preprocessor', preprocessor)])


In [None]:
# The TF-IDF step yields a SciPy sparse matrix; passing that straight to
# pd.DataFrame does not expand it into one column per feature. Densify first
# (the hasattr guard keeps this working if the transformer returns an ndarray).
train_features = transformer.fit_transform(x_train)
if hasattr(train_features, "toarray"):
    train_features = train_features.toarray()

df_transformed = pd.DataFrame(train_features)
df_transformed

In [None]:
from sklearn.linear_model import LogisticRegression
pipe = Pipeline(
    steps=[
        ("preprocessor", transformer), 
        ("logreg", LogisticRegression())
        ]
    )

pipe.fit(x_train,y_train)

In [None]:
# Hold out 30% of the rows. `train` / `test` are full frames so the label
# columns (Q1..Q4) can be selected per model afterwards.
# (The original cell unpacked a four-way split into two names, referenced the
# undefined `train_text` / `test_text`, and had statements fused on one line.)
label_cols = ['Q1', 'Q2', 'Q3', 'Q4']
train, test = train_test_split(data, random_state=42, test_size=0.30, shuffle=True)
train_text = train['comment_text_lem']
test_text = test['comment_text_lem']

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(strip_accents='unicode', analyzer='word', ngram_range=(1,3), norm='l2')
# Fit on the training text only: a second fit on the test text would discard
# the training vocabulary and leak held-out data into the features.
vectorizer.fit(train_text)

x_train = vectorizer.transform(train_text)
y_train = train[label_cols]
x_test = vectorizer.transform(test_text)
y_test = test[label_cols]

In [None]:
# Using pipeline for applying logistic regression and one vs rest classifier
LogReg_pipeline = Pipeline([
                ('clf', OneVsRestClassifier(LogisticRegression(solver='sag'), n_jobs=-1)),
            ])for category in categories:
    print('**Processing {} comments...**'.format(category))
    
    # Training logistic regression model on train data
    LogReg_pipeline.fit(x_train, train[category])
    
    # calculating test accuracy
    prediction = LogReg_pipeline.predict(x_test)
    print('Test accuracy is {}'.format(accuracy_score(test[category], prediction)))
    print("\n")

### 4.2 X, y, and Train/Test Split


In [None]:
X = data[bin_cols+nom_cols+num_cols]
y = data[target]

x_train, x_test, y_train, y_test = train_test_split(X, y , test_size=.2, random_state=42)

In [None]:
# Categorical columns to expand into one-hot indicator columns.
cols = ['company','job_title', 'state', 'city']

encoder = OneHotEncoder(handle_unknown='ignore')

#perform one-hot encoding on the categorical columns listed in `cols`
# (.toarray() densifies the encoder's sparse output before wrapping it)
encodered_data = pd.DataFrame(encoder.fit_transform(data[cols]).toarray())

#merge one-hot encoded columns back with original DataFrame
# NOTE(review): join aligns on index and `encodered_data` has a fresh 0..n-1
# index, so this presumably assumes `data` still has its default index - verify.
final_df = data.join(encodered_data)
final_df.drop(cols, axis=1, inplace=True)

#view final df
final_df

In [None]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords 
stop_words = set(stopwords.words('english')) 

word_tokens = word_tokenize(data.cleaned[0].strip(' '))
my_var = [w for w in word_tokens if (not w in stop_words and len(w) > 1)]
my_var

In [None]:
import nltk
nltk.download('omw-1.4')

In [None]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [None]:
lemmatized = [lemmatizer.lemmatize(token) for token in my_var]

In [None]:
lemmatized

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(strip_accents='unicode', analyzer='word', ngram_range=(1,3), norm='l2')
vectorizer.fit_transform(data.cleaned)

In [None]:
# Vocabulary terms in column order. get_feature_names_out() matches the calls
# used earlier in this notebook; get_feature_names() is gone from newer
# scikit-learn releases.
feature_names = vectorizer.get_feature_names_out()
feature_names

In [None]:
data

In [None]:
# NOTE: despite the variable name this is a TF-IDF vectorizer, tokenising with
# NLTK's word_tokenize and dropping scikit-learn's English stop words.
count_vectorizer = TfidfVectorizer(
    analyzer="word", tokenizer=word_tokenize,
    preprocessor=None, stop_words='english', max_features=None)

tfidf = count_vectorizer.fit_transform(data['cleaned'])

# get_feature_names_out() is the API already used elsewhere in this notebook;
# get_feature_names() is removed in newer scikit-learn releases.
count_vectorizer.get_feature_names_out()

In [None]:
# Dense document-term frame: one row per document, one column per vocabulary term.
# toarray() returns a plain ndarray (todense() returns the discouraged np.matrix),
# and get_feature_names_out() replaces the removed get_feature_names().
class_train = pd.DataFrame(tfidf.toarray(), columns=count_vectorizer.get_feature_names_out())

In [None]:
class_train

In [None]:
# Mean TF-IDF weight of every term across all documents, in the same order as
# class_train.columns. A single vectorised column sum replaces the Python-level
# loop over columns (and the built-in sum() over each Series).
lst = (class_train.sum(axis=0) / len(data)).tolist()


In [None]:
max(lst)

In [None]:
data

In [None]:
data.salary

In [None]:
data.rename(columns = {'salary':'salary_main'}, inplace = True)

In [None]:
data.columns

In [None]:
cols = ['text','cleaned']
data.drop(cols, axis=1, inplace=True)

In [None]:
data

In [None]:
final_df

In [None]:
final_df = data.join(class_train)
final_df

In [None]:
final_df.salary_main

In [None]:

# Keep only rows with a known salary, applying the SAME mask to the features
# and the target so x and y stay row-aligned (previously only y was filtered,
# leaving x with more rows than y whenever a salary was missing).
has_salary = final_df.salary_main.notna()
y = final_df.loc[has_salary, 'salary_main']
x = final_df.loc[has_salary].drop('salary_main',axis=1)

In [None]:
final_df

In [None]:



from sklearn.model_selection import train_test_split

# Filter features and target with the same mask: train_test_split requires x
# and y to have the same number of rows, which the original y-only filter
# violated whenever salary_main contained missing values.
has_salary = final_df.salary_main.notna()
y = final_df.loc[has_salary, 'salary_main']
x = final_df.loc[has_salary].drop('salary_main',axis=1)

# 70/30 hold-out split with a fixed seed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(x,y, random_state=42, test_size=0.30, shuffle=True)

print(x_train.shape)
print(x_test.shape)




In [None]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()


In [None]:
x.salary

In [None]:
x_test

In [None]:
y_train

In [None]:
clf = LogisticRegression(penalty='l2', class_weight='balanced')
clf.fit(x_train, y_train)

In [None]:
predictions = clf.predict(x_test)

In [None]:
from sklearn.metrics import accuracy_score, hamming_loss,precision_score,recall_score,f1_score,classification_report
predictions

In [None]:
print("Accuracy :",accuracy_score(y_test, predictions))

In [None]:

print("Hamming loss ",hamming_loss(y_test,predictions))

In [None]:
precision = precision_score(y_test, predictions, average='micro')
recall = recall_score(y_test, predictions, average='micro')
f1 = f1_score(y_test, predictions, average='micro')

In [None]:
print("\nMicro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

In [None]:
precision = precision_score(y_test, predictions, average='macro')
recall = recall_score(y_test, predictions, average='macro')
f1 = f1_score(y_test, predictions, average='macro')

In [None]:
print("\nMacro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

In [None]:
print("\nClassification Report")
print (classification_report(y_test, predictions))

In [None]:
predictions

In [None]:
%reset