In [302]:
import numpy as np
import pandas as pd
from scipy.stats import norm
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, MinMaxScaler


import re
from datetime import datetime, date, timedelta


import matplotlib.pyplot as plt
import seaborn as sns
from to_img import to_img



plt.rcParams['figure.figsize'] = (8, 6)
plt.rcParams['font.size'] = 17
import warnings # current version of seaborn generates a bunch of warnings that we'll ignore
warnings.filterwarnings("ignore")
sns.set(style="ticks", color_codes=True)



import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, MinMaxScaler, LabelEncoder

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn import set_config
set_config(display='diagram')

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, log_loss, roc_auc_score, roc_curve, hamming_loss,precision_score,recall_score,f1_score

from confusion import make_confusion_matrix
import re

import matplotlib.pyplot as plt
import seaborn as sns
plt.rcParams['figure.figsize'] = (8, 8)
plt.rcParams['font.size'] = 17

import warnings
warnings.filterwarnings("ignore")
sns.set(style="ticks", color_codes=True)
%matplotlib inline

## 1. Data

The data consists of text scraped from every search result for 'data science/scientist' on Indeed.com using the [Requests](https://docs.python-requests.org/en/master/ "Requests Library"), Tor, and [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/bs4/doc/ "BeautifulSoup") libraries.

### 1.1 Load and Inspect Table

In [303]:
# note the change in memory usage.
data = pd.read_csv('../app/data/total.csv')
print(len(data))
data.info(memory_usage='deep')
data.head()

: 

: 

This table is a view of the raw data from the scrape. I'll build the target by reducing the pay column to floating point values; this requires text cleaning, which will also be applied to the rest of the table. Location will be split into new columns for City and State, and because Indeed's search results show dates relative to the date the query was made, I'll need to use PostDate and ExtractDate to build a DatePosted column. It is important to note that JobUrl and the date-related columns will not be converted into features for the logistic regression; instead they will be reattached to the data at the end in order to provide additional information and insights through the EC2 application. The only columns that I'll be using for modelling are JobTitle, Company, Summary, Requirements, and Description.

In [None]:
data.extractDate.value_counts()

In [None]:
data.columns = map(str.lower, data.columns)
data = data.rename({'title': 'job_title'}, axis=1)

##### Data Dictionary
- company<br>
&nbsp;
&nbsp;name of company
- description<br>
&nbsp;full text description of job
- estimated_salary<br>
&nbsp;Indeed's salary estimation
- extractDate<br>
&nbsp;String(?), date the record was web-scraped
- job_type_items<br>
&nbsp;formatted list of 'full-time', 'part-time', etc
- location<br>
&nbsp;state and city/remote
- postDate<br>
&nbsp;date job posting was created
- rating<br>
&nbsp;Indeed.com ascribed company ratings
- raw_desc_soup<br>
&nbsp;raw full description, retained for possible future use/analysis
- requirements<br>
&nbsp;employer-listed educational, aptitude, and experience requirements
- sal_guide_items<br>
&nbsp;formatted list containing redundant characters and the Indeed salary estimate
- salary<br>
&nbsp;if given by employer
- salary_and_jType<br>
&nbsp;formatted list combining salary from salfromsection and job_type_items
- salfromsection<br>
&nbsp;salary, if given, extracted from job description
- summary<br>
&nbsp;brief summaries of each job extracted from the search results
- title<br>
&nbsp;job position title
- url<br>
&nbsp;job posting/description address

In [None]:
# check for duplcates.
print(f'Number of dupes: {sum(data.duplicated())}')
data[data.duplicated() == True]

Duplicates are unlikely to come out of the raw scrape, but this is still a good check to run in case something changes in Indeed.com's front-end code.

Missing values are another story. It seems that Indeed.com doesn't require employers to provide values for every feature; this certainly seems to be the case with requirements, as well as with every field where salary information could be held.\
Others, like estimated salary and ratings are generated by Indeed.com but don't exist for every posting.\
job_type_items and salary_and_jType will likely be collapsed into a new column for job types, I'll also do the same with all of the salary related features to build the model's target.

### 1.1 Diagnose Data
In order to proceed with any statistical or exploratory data analysis I will need to wrangle the data quite a bit.
Here, I'll list each of the current feature columns and briefly describe what I'll need to do with them.

Column Groups and Needs
Simple
Salary
Text
etc

#### 1.2 Features


In [None]:
data.nunique().sort_values(ascending=False)

In [None]:
#check for missing
print(data.isna().sum().sort_values(ascending=False))

In [None]:

# Report the table's size. Every column except the eventual target is a
# candidate predictor, i.e. an *independent* variable.
print(f"The {data.shape[0]} rows of data have {data.shape[1]} features. Since this is a classification problem I'll be selecting one to be the target, leaving me with {data.shape[1]-1} independent variables to work with.")


##### 1.2a Rating

As the only feature coming in with numeric (float) values I should just need to scale it. I'll address missing values by imputing zero. The rationale behind this is that an unrated company should simply be rated as zero.
print(sorted(data.rating.unique()))

In [None]:
data[data.rating.isnull()].head(3)

In [None]:
def missing(data, feature):
    """Print a short missing-value report for one column and describe it.

    Args:
        data (pd.DataFrame): table to inspect.
        feature (str): name of the column to report on.

    Returns:
        pd.Series: ``data[feature].describe()``.
    """
    total = len(data)
    missing_len = int(data[feature].isnull().sum())

    if missing_len == 0:
        print('nothing missing')
    else:
        # share of rows that are missing, expressed as a percentage
        print(f'Percent Missing: {round(missing_len / total * 100, 1)}%')

    print(f'Number Missing: {missing_len}')

    # Mean of the observed values of *this* feature (NaN is skipped);
    # only meaningful for numeric columns.
    if pd.api.types.is_numeric_dtype(data[feature]):
        print(data[feature].mean())

    return data[feature].describe()
        
missing(data, feature = 'rating')



For now I'll drop these. I'm considering what and how to impute. What I'd like to do is run clustering on the data, or perhaps logistic regression. One thing I know is that the picture will become clearer as I gain more data. For now though, the amount of missing values is very small.

In [None]:
data = data[data['rating'].notnull()]
print(len(data))

ax = sns.distplot(data.rating, fit=norm, bins=data.rating.nunique(), kde=True)
plt.title('raw')
#ax.set_xticks(data.rating.unique())
plt.axvline(x=np.mean(data.rating), label='mean', c = 'r')
plt.legend()
plt.show()

outliers

In [None]:
def z_detect(data, feature, threshold=3):
    """Return the values of a column whose z-score magnitude exceeds a threshold.

    The result is built in a local list, so repeated calls (or re-running the
    cell) never accumulate values from earlier calls.

    Args:
        data (pd.DataFrame): table holding the column.
        feature (str): name of the numeric column to scan.
        threshold (float): absolute z-score above which a value is flagged.

    Returns:
        list or None: flagged values in ascending order, or None when no value
        is flagged.
    """
    sample = data[feature]
    mean = np.mean(sample)
    std = np.std(sample)

    z_scores = (sample - mean) / std
    flagged = sorted(sample[np.abs(z_scores) > threshold])
    return flagged if flagged else None
z_detect(data, feature = 'rating')


In [None]:
def _plot_rating_summary(frame, q1, q2, q3, iqr_lower, iqr_upper):
    """Draw the rating histogram with quartile/fence/mean markers, then a boxplot."""
    sns.displot(frame.rating)
    plt.axvline(x=q1, label="Q1", c = 'g')
    plt.axvline(x=q2, label="Q2", c = 'b')
    plt.axvline(x=q3, label="Q3", c = 'r')

    plt.axvline(x=iqr_lower, label = 'IQR Lower', c = 'black')
    plt.axvline(x=iqr_upper, label = 'IQR Upper', c = 'black')
    plt.axvline(x=np.mean(frame.rating), label='mean', c = 'y')
    plt.xticks(rotation=30)
    plt.legend()
    plt.show()

    sns.boxplot(data=frame, x='rating')
    plt.xticks(rotation=30)
    plt.show()


def iqr_detect(data):
    """Trim rating outliers with the 1.5 * IQR rule and plot before/after.

    Args:
        data (pd.DataFrame): table with a numeric ``rating`` column.

    Returns:
        tuple: ``(kept, trimmed)`` where ``kept`` holds the rows whose rating
        lies inside the fences (inclusive) and ``trimmed`` holds the rows
        below the lower fence or above the upper fence.
    """
    q1 = np.quantile(data.rating, 0.25)
    q2 = np.quantile(data.rating, 0.5)
    q3 = np.quantile(data.rating, 0.75)

    # fences sit 1.5 interquartile ranges outside Q1 and Q3
    iqr_x = (q3 - q1) * 1.5
    iqr_lower = q1 - iqr_x
    iqr_upper = q3 + iqr_x

    below = data.rating < iqr_lower
    above = data.rating > iqr_upper

    # report
    print(f'There are {int(below.sum())} records below and {int(above.sum())} above.')

    # distribution before trimming
    _plot_rating_summary(data, q1, q2, q3, iqr_lower, iqr_upper)

    # trimming outliers: keep both tails in `trimmed`, not just the last one assigned
    trimmed = data[below | above]
    print(len(data))
    data = data[(data.rating >= iqr_lower) & (data.rating <= iqr_upper)]
    print(len(data))

    # distribution after trimming (markers still use the pre-trim quartiles)
    _plot_rating_summary(data, q1, q2, q3, iqr_lower, iqr_upper)
    return data, trimmed

data, trimmed = iqr_detect(data)

In [None]:
ax = sns.distplot(data.rating, fit=norm, bins=data.rating.nunique(), kde=True)
plt.title('raw')
#ax.set_xticks(data.rating.unique())
plt.axvline(x=np.mean(data.rating), label='mean', c = 'r')
plt.legend()
plt.show()


In [None]:
data.rating.describe()

In [None]:


# Standardise the rating column (zero mean, unit variance) and plot the result.
# The scaler is fit once; the original cell repeated the same two lines twice.
scalar = StandardScaler()
scaled_ratings = pd.DataFrame(scalar.fit_transform(data[['rating']]), columns=['rating'])

ax = sns.distplot(scaled_ratings, fit=norm, bins=data.rating.nunique(), kde=True)
plt.title('scaled')
#ax.set_xticks(data.rating.unique())
plt.axvline(x=np.mean(scaled_ratings.rating), label='mean', c = 'r')
plt.legend()
plt.show()

scaled_ratings.rating.describe()


##### 1.2a Dates

'extractdate' & 'postdate'\
extract_date is a string showing the day the record was web-scraped.
post_date is a string description of the relative age of each post from the date it was posted to the day it was web-scraped.
Converting post_date to a number will give me the post's age, which I'll then subtract from its extract date in order to build a new feature carrying the DATE each job posting was submitted to Indeed.com.

The extractdate is the value given by my parser representing the date the data was pulled from Indeed.com.\
The postdate is how many days have passed since the post was made and when the search was conducted (by my webscraper).\
The task is to find the delta (difference), the intervening days since the post was made, and subtract it from the extract date so I can get the actual date published.

In [None]:
data[['extractdate', 'postdate']].head()

In [None]:
print(missing(data, feature = 'extractdate'),'\n')
print(missing(data, feature = 'postdate'))

In [None]:
print(data.postdate.unique(),'\n')
print(data.extractdate.unique(),'\n')
print(data.postdate.value_counts(),'\n')
print(data.extractdate.value_counts())

In [None]:
def postD(row):
    """Convert a row's relative post age into the calendar date it was posted.

    Args:
        row (mapping): record with a 'postdate' label (e.g. 'PostedToday',
            'PostedPosted 1 day ago', 'Posted30+ days ago') and an
            'extractdate' value parseable by ``pd.to_datetime``.

    Returns:
        pd.Timestamp or pd.NaT: extract date minus the post's age in days;
        NaT when 'postdate' is missing or its age cannot be determined.
    """
    posted = row['postdate']
    # NaN never compares equal to anything, so missing labels are detected by type.
    if not isinstance(posted, str):
        return pd.NaT

    extracted = pd.to_datetime(row['extractdate'])
    label = posted.lower()

    # 'PostedToday', 'PostedJust posted' and 'Hiring ongoing' all mean age zero.
    if any(key in label for key in ('today', 'just posted', 'ongoing')):
        return extracted - timedelta(0)

    # 'Posted N day(s) ago' / 'Posted30+ days ago' ("30+" is read as its lower bound, 30).
    # NOTE(review): labels that are not about posting (no 'posted' in them) are
    # deliberately left as NaT — confirm against the scraped values.
    match = re.search(r'(\d+)\+?\s*days?\s+ago', label)
    if match and 'posted' in label:
        return extracted - timedelta(days=int(match.group(1)))

    return pd.NaT



data['dateposted'] = data.apply( lambda row : postD(row), axis = 1)

data[['extractdate', 'postdate','dateposted']].head()

##### 1.2b Salary

'salary'\
Several different approaches will need to be used to extract salary information from this feature because employers provide this information themselves.
For example, pay periods of hour, day, week, etc. will need to be converted to an annual scale so I have a standard scale to start with.
Salary ranges also appear here, so I'll be extracting the numeric string characters, converting them into floats representing the lower and upper bounds of each range, and taking the midpoint of the two.

'estimated_salary'\



'sal_guide_items'\


'salary_and_jtype' & 'salfromsection'\
These will be wrangled in much the same way. The only other pertinent information in these features would be the job_type data already found in job_type_items,


The 'salary' feature is whatever information the employer provided. There are several subgroups within these values:\
&nbsp;work week\
While I could convert these to lists, the only information I need (salary) can be parsed out.


The 'estimated_salary' feature 
Indeed.com provides its own salary estimates in what appear to be bins.
These and all other salary ranges will be collapsed to their midpoint values once I've extracted and converted their numeric string representations.



The task is to find the delta (difference), the intervening days since the post was made, and subtract it from the extract date so I can get the actual date published.

In [None]:
print(data.salary.unique()[:])
print(data.estimated_salary.unique()[:3],'\n')
data[['salary','estimated_salary']].head(3)
x = data[['salary','estimated_salary']]
for i in x:
    print(i)


In [None]:
def justNumbers(data):
    """Reduce a raw salary / estimated-salary string to a single float.

    Ranges ('$80,000 - $100,000 a year') collapse to the midpoint of their two
    bounds. Hourly figures are annualised assuming a forty hour work week.

    Args:
        data: raw cell value; anything that is not a ``str`` (e.g. NaN) yields None.

    Returns:
        float or None: the parsed amount, or None when no amount can be parsed.
    """
    if type(data) != str:
        return None

    # --- estimated_salary column: list-like text containing Indeed's estimate ---
    if re.search(r'Not provided by employer', data):
        try:
            est = data.split(',')[2].split('a year')[0]
            est = re.sub(r'[a-zA-Z,&$()]+|:\]', r'', est)
            est = est.replace("-", "").replace('"', "")
            low, high = est.split()[:2]
            # NOTE(review): any letter suffix such as 'K' is stripped without
            # rescaling, so the result is in whatever unit the text used — confirm.
            return (float(low) + float(high)) / 2
        except (IndexError, ValueError):
            # text did not have the expected "<..>,<..>,<low - high a year ..>" shape
            return None

    # --- salary column: employer-provided text ---
    # A real dollar sign is required; the pattern r'$' only anchors end-of-string
    # and therefore matched every value (e.g. '8 hour shift' parsed as 8.0).
    if '$' not in data or not re.search(r'\d', data):
        return None

    if ' - ' in data:
        bounds = data.split(' - ')
        sal = bounds[0] + ' ' + bounds[1]
    else:
        sal = data

    sal = re.sub(r'[?|!\'"#/()$:]', r'', sal)   # punctuation and currency sign
    sal = re.sub(r'[a-zA-Z,&()]+', r'', sal)    # words and thousands separators
    sal = sal.replace("-", "").strip()

    pieces = sal.split()
    try:
        if len(pieces) >= 2:
            amount = (float(pieces[0]) + float(pieces[1])) / 2
        else:
            amount = float(sal)
    except ValueError:
        return None

    # Assuming a forty hour workweek; applies to single values as well as ranges.
    # NOTE(review): other pay periods (day/week/month) are still returned unscaled.
    if re.search(r'hour', data):
        return amount * 40 * (365 / 7)
    return amount

        



data['cleanedsal'] = data.salary.apply(justNumbers)
data['cleanedest'] = data.estimated_salary.apply(justNumbers)

data['cleanedsal'].unique()

In [None]:
def salaries(row):
    """Pick a single salary figure for a row, preferring Indeed's estimate.

    Args:
        row (mapping): record with 'cleanedest' (parsed Indeed estimate) and
            'cleanedsal' (parsed employer-provided salary); either may be
            None/NaN.

    Returns:
        float or None: 'cleanedest' rounded to 2 decimals when it is positive,
        otherwise 'cleanedsal' rounded to 2 decimals when it is positive,
        otherwise None.
    """
    for column in ('cleanedest', 'cleanedsal'):
        value = row[column]
        # guard None/NaN explicitly: `None > 0` raises and `nan > 0` is False
        if value is not None and pd.notna(value) and value > 0:
            return round(float(value), 2)
    return None

data['salaries'] = data.apply( lambda row : salaries(row), axis = 1).astype(float)


In [None]:
#TODO There's no overlap here, which is anticipated. owever, I need to come back and see what's being cleaned out/lost.
cols = ['salary','cleanedsal','estimated_salary','cleanedest','salaries']
for i in cols:
    print(i)
    print(len(data[i].unique()),'\n')

data[cols].head(3)

In [None]:
data[data.salaries.notna()].head()

## Text


In [None]:
data.columns

In [None]:
# str_features = data[['company','job_title','location','summary','description']]
str_features = data[['company','job_title']]
str_features.head(5)


In [None]:
def cleanText_comp_jtitle(sentence):
    """Normalise a company name or job title for use as a categorical feature.

    Expands the abbreviations AI, ML and 'Sr.', lower-cases the text, removes
    every character that is not a letter, digit, space, newline, '.' or '|',
    and collapses runs of spaces.

    Args:
        sentence: raw text; non-string values (e.g. NaN) are returned unchanged.

    Returns:
        str: the cleaned text (or the untouched input when it is not a string).
    """
    if not isinstance(sentence, str):
        return sentence

    cleaned = sentence.replace('AI/ML', 'artificial intelligence machine learning')
    # whole-word matches only, so 'HTML' or 'MAIN' are not rewritten
    cleaned = re.sub(r'\bAI\b', 'artificial intelligence', cleaned)
    cleaned = re.sub(r'\bML\b', 'machine learning', cleaned)
    cleaned = re.sub(r'\bSr\.', 'senior', cleaned)
    cleaned = cleaned.lower()
    cleaned = re.sub(r'[^a-z0-9 \n.|]+', r'', cleaned)
    # removing punctuation can leave runs of several spaces behind
    cleaned = re.sub(r' {2,}', ' ', cleaned)

    return cleaned



str_features = data[['company','job_title']]
for i in str_features:
    data[i] = data[i].apply(cleanText_comp_jtitle)



#before = [data.job_title]
#after = [data2.job_title]


"""
for i,j in zip(before[0],after[0]):
    #print(i)
    print(j)
"""

In [None]:

def cleanText_loc(row):
    """Split a row's 'location' string into 'state', 'city' and 'remote' fields.

    Args:
        row (mapping): record with a 'location' value and pre-existing
            'state', 'city' and 'remote' entries (left untouched when nothing
            is found).

    Returns:
        The same row, with:
          * 'state'  - two-letter abbreviation, taken from a ', XX' suffix
                       when present, otherwise from a full state name;
          * 'city'   - text before the first comma, without the
                       'Remote in ' / 'Hybrid' / 'remote in ' prefixes;
          * 'remote' - 'remote in' or 'remote' when the location says so.
    """
    states = {
        'AK': 'Alaska', 'AL': 'Alabama', 'AR': 'Arkansas', 'AZ': 'Arizona',
        'CA': 'California', 'CO': 'Colorado', 'CT': 'Connecticut',
        'DC': 'District of Columbia', 'DE': 'Delaware', 'FL': 'Florida',
        'GA': 'Georgia', 'HI': 'Hawaii', 'IA': 'Iowa', 'ID': 'Idaho',
        'IL': 'Illinois', 'IN': 'Indiana', 'KS': 'Kansas', 'KY': 'Kentucky',
        'LA': 'Louisiana', 'MA': 'Massachusetts', 'MD': 'Maryland',
        'ME': 'Maine', 'MI': 'Michigan', 'MN': 'Minnesota', 'MO': 'Missouri',
        'MS': 'Mississippi', 'MT': 'Montana', 'NC': 'North Carolina',
        'ND': 'North Dakota', 'NE': 'Nebraska', 'NH': 'New Hampshire',
        'NJ': 'New Jersey', 'NM': 'New Mexico', 'NV': 'Nevada',
        'NY': 'New York', 'OH': 'Ohio', 'OK': 'Oklahoma', 'OR': 'Oregon',
        'PA': 'Pennsylvania', 'RI': 'Rhode Island', 'SC': 'South Carolina',
        'SD': 'South Dakota', 'TN': 'Tennessee', 'TX': 'Texas', 'UT': 'Utah',
        'VA': 'Virginia', 'VT': 'Vermont', 'WA': 'Washington',
        'WI': 'Wisconsin', 'WV': 'West Virginia', 'WY': 'Wyoming',
    }

    location = row['location']
    if not isinstance(location, str):
        return row

    # 1) An explicit ', XX' abbreviation wins (e.g. 'Washington, DC' is DC, not WA).
    state = None
    for match in re.finditer(r',\s*([A-Z]{2})\b', location):
        if match.group(1) in states:
            state = match.group(1)
            break

    # 2) Otherwise look for a full state name as a whole word, longest names
    #    first so 'West Virginia' beats 'Virginia' and 'Arkansas' never reads
    #    as 'Kansas'.
    if state is None:
        for abbr, name in sorted(states.items(), key=lambda kv: len(kv[1]), reverse=True):
            if re.search(r'\b' + re.escape(name) + r'\b', location):
                state = abbr
                break

    if state is not None:
        row['state'] = state

    if ',' in location:
        city = location.split(',')[0].replace('Remote in ', '')
        city = city.replace('Hybrid', '')
        row['city'] = city.replace('remote in ', '').strip()

    # case-insensitive so both 'Remote in Austin' and 'Hybrid remote in Austin' are flagged
    lowered = location.lower()
    if 'remote in ' in lowered:
        row['remote'] = 'remote in'
    elif lowered.strip() == 'remote':
        row['remote'] = 'remote'

    return row

data = data.copy()
data['remote'] = ''
data['state'] = ''
data['city'] = ''
data = data.apply( lambda row : cleanText_loc(row), axis = 1)
data[['location','state','city','remote']].head(3)

In [None]:
len(data)
data.dropna(subset=['description'], inplace=True)
len(data)

In [None]:
def cleadText_descriptive(row):
    """Build the row's 'text' field from its summary and description.

    The two texts are joined with a space, split into sentences on '.',
    stripped of the punctuation characters ``? | ! ' " # / , ( ) $ :``, reduced
    to runs of letters and '&', and re-joined with single spaces in lower case.

    Args:
        row (mapping): record with 'summary' and 'description'; a value that is
            not a string (e.g. NaN) is skipped instead of raising.

    Returns:
        The same row with 'text' set to the cleaned string.
    """
    parts = [part for part in (row['summary'], row['description']) if isinstance(part, str)]
    text = ' '.join(parts)

    clean_sentences = []
    for sentence in text.split('.'):
        # NOTE(review): these characters are deleted, not replaced by a space,
        # so 'python/sql' becomes 'pythonsql' — kept as in the original.
        stripped = re.sub(r'[?|!\'"#/,()$:]', r'', sentence)
        tokens = re.findall(r'[a-zA-Z&]+', stripped)
        if tokens:
            clean_sentences.append(' '.join(tokens))

    row['text'] = ' '.join(clean_sentences).lower()
    return row


data = data.copy()
data['text'] = ''
data = data.apply( lambda row : cleadText_descriptive(row), axis = 1)
data[['summary','description','text']].head(3)



In [None]:
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")
def stemming(sentence):
    """Return `sentence` with every whitespace-separated word replaced by its stem.

    Uses the module-level Snowball `stemmer`; words are re-joined with single
    spaces.
    """
    return " ".join(stemmer.stem(word) for word in sentence.split())


# `data2` was never created before being assigned to; keep the stemmed text on
# a copy so `data['text']` stays unstemmed for the lemmatisation cell.
data2 = data.copy()
data2['text'] = data['text'].apply(stemming)
# positional lookup: label 0 may have been filtered out of the index earlier
data2.text.iloc[0]

In [None]:
from nltk.stem import WordNetLemmatizer



lemmatizer = WordNetLemmatizer()
def lemming(sentence):
    """Return `sentence` with every whitespace-separated word lemmatised.

    Uses the module-level WordNet `lemmatizer`; words are re-joined with single
    spaces.
    """
    return " ".join(lemmatizer.lemmatize(word) for word in sentence.split())


data['text'] = data['text'].apply(lemming)
# positional lookup: label 0 may have been filtered out of the index earlier
data.text.iloc[0]

In [None]:
data.head(1)

In [None]:
data.salaries.isna()

In [None]:
sal_data = data[data.salaries.notna()]
sal_data.salaries.astype('float')

In [None]:
q1 = np.quantile(sal_data.salaries, 0.25)
q2 = np.quantile(sal_data.salaries, 0.5)
q3 = np.quantile(sal_data.salaries, 0.75)

In [None]:

def Q(row, quartiles=None):
    """Assign a row to a salary quartile.

    Args:
        row (mapping): record with a 'salaries' value.
        quartiles (tuple, optional): ``(q1, q2, q3)`` cut points. Defaults to
            the notebook-level ``q1``, ``q2`` and ``q3``.

    Returns:
        int or float: 1 when salary < q1, 2 when q1 <= salary < q2, 3 when
        q2 <= salary < q3, 4 when salary >= q3, and ``np.nan`` when the salary
        is missing.
    """
    if quartiles is None:
        quartiles = (q1, q2, q3)
    low, mid, high = quartiles

    salary = row['salaries']
    if salary is None or pd.isna(salary):
        return np.nan
    if salary < low:
        return 1
    if salary < mid:
        return 2
    if salary < high:
        return 3
    return 4


data['target'] = data.apply( lambda row : Q(row), axis = 1)


In [None]:
ohe = OneHotEncoder(sparse=False, dtype='int')
targets = ohe.fit_transform(pd.DataFrame(data.target))
targets

In [None]:

# `data` was filtered earlier without resetting its index, so the one-hot frame
# must carry the same index; with a default RangeIndex the join would attach
# target rows to the wrong postings (or NaN).
targets = pd.DataFrame(targets, columns=['Q1','Q2','Q3','Q4','unk'], index=data.index)
data = data.join(targets)
data.head(1)

In [None]:
data.drop(['unk'], axis=1,inplace=True)

In [None]:
data

In [None]:
data.info()

In [None]:
# Keep only the modelling features, the job URL and the four quartile targets.
# NOTE(review): this drops 'salaries', 'summary' and 'dateposted', which later
# cells still reference — confirm the intended cell order.
data = data[['company','job_title','state','city','rating','text','url', 'Q1', 'Q2', 'Q3', 'Q4']]

In [None]:
data[data['url'].isna()].head()

In [None]:
data.head(1)

In [None]:
targets = ['Q1','Q2','Q3','Q4']
X = data.drop(targets, axis=1)
# Drop by column name (the default axis drops row labels), and tolerate the
# columns already being absent after the earlier column selection.
X = X.drop(columns=['summary','dateposted'], errors='ignore')

In [None]:
data.head(1)

In [None]:
def grid_search_pipe():
    """Grid-search one binary logistic-regression pipeline per quartile target.

    For each of 'Q1'..'Q4' the notebook-level `data` is split 80/20
    (random_state=42), a preprocessing + classifier pipeline is tuned with
    `GridSearchCV`, and the best score and parameters are printed.

    Returns:
        dict: target name -> {'grid_search', 'x_train', 'y_train', 'x_test',
        'y_test'} for that target.
    """
    targets = ['Q1','Q2','Q3','Q4']
    X = data.drop(targets, axis=1)

    le_cols = ['company', 'job_title', 'state', 'city']
    scal_cols = ['rating']

    # parameter grid to search (identical for every target)
    grid = [
        {
        'classifier' : [LogisticRegression()],
        #'classifier__penalty' : ['l1', 'l2'],
        #'classifier__C' : np.logspace(-4, 4, 20),
        'classifier__solver' : ['liblinear']}
        ]

    evaluations = {}

    for i in targets:
        # test/train split
        y = data[i]
        x_train, x_test, y_train, y_test = train_test_split(X, y , test_size=.2, random_state=42)

        # columns not named here (e.g. 'url') are dropped by remainder='drop'
        preprocessor = ColumnTransformer(
            transformers=[
                ('text', TfidfVectorizer(ngram_range=(1,3), analyzer = 'word',max_features=1000,stop_words='english',decode_error='ignore'), 'text'),
                ('category', OneHotEncoder(handle_unknown ='ignore'), le_cols),
                ('scaler', StandardScaler(), scal_cols)
                ],remainder='drop')

        # The placeholder classifier must be a valid estimator: passing the
        # grid positionally made it the `penalty` argument, which only went
        # unnoticed because the grid replaces this step before fitting.
        pipe = Pipeline(
            steps=[
            ('preprocessor', preprocessor),
            ('classifier', LogisticRegression(solver='liblinear')),
            ],
            )

        grid_search = GridSearchCV(pipe, param_grid=grid, verbose=2, return_train_score=True)
        grid_search.fit(x_train,y_train)

        print("Best parameter (CV score=%0.3f):" % grid_search.best_score_)
        print(grid_search.best_params_)

        # target names are unique, so each key is written exactly once
        evaluations[i] = {
            'grid_search':grid_search,
            'x_train':x_train,
            'y_train':y_train,
            'x_test':x_test,
            'y_test':y_test
        }

    return evaluations


evaluations = grid_search_pipe()

In [None]:



# `grid_search_pipe` is already defined in the previous grid-search cell; this
# cell held a byte-for-byte copy of that definition, so it now only re-runs it.
evaluations = grid_search_pipe()

In [None]:
# Summarise how many postings carry any salary information.
# NOTE(review): an earlier cell reduces `data` to a fixed column list that does
# not include 'salaries'; presumably this cell was written to run before that
# selection — confirm the intended order.
null = sum(data.salaries.isnull())      # rows without a salary value
nnull = sum(data.salaries.notnull())    # rows with a salary value
print('')
print(f'The data contains {data.shape[0]} rows of individual job postings with values for {data.shape[1]} columns based on\n different sections of each post\'s web page.')
print('')
print(f'- Out of {len(data)} job postings {nnull} or {round(nnull/len(data)*100,2)}% include some sort of salary information,\n- The remaining {null} rowws, or {round(null/len(data)*100,2)}% are missing salary data.')
#\n- Also, there are no null values in the rest of the table meaning we won\'t need\n  to lose any data by dropping rows. While the null values in the Pay column will\n  simply be dropped after we translate the not nulls into the target.')


# This bare expression is not the cell's last statement, so it displays nothing.
data[data['salaries'].notna()].salaries
# `missing` prints its report and returns describe(); print() shows that return value.
print(missing(data, feature = 'salaries'))

'job_type_items'\
    I'm going to treat these as Likert values, converting each of these string-formatted lists into an ordinal value for a new column.


In [None]:
print(missing(data, feature = 'job_type_items'))

In [None]:


TODO

Fri - Sat - Sun
- make sure shit has zeros
- check if there is missing salary data from dropped columns
------ clean: company, description, location, summary, job_title
------ split location
------ join main text
- prepare main text for nlp

- drop cols and do eda

- gridsearch
- model
- evaluation
- collate all data
- find class characteristics, i.e. top employers/locations/words-phrases

- Code review and organizing
- readme


Monday
- plots
- review kmeans
- review indeed
- review the best 2-3 other projects
- categorical features


Tuesday
- review remaining projects
- categorical features (wrap up)
- numeric features
- 2hrs linear alg
- 2hrs stats
- 2hrs proba


Wednesday
- review all projects, particularly text/readmes
- review resume
- 2hrs Git: commits and prettify
- LinkedIn
- Job Sites
- 1hr linear alg
- 1hr stats
- 1hr proba
- look at jobs/need for CVs


Thurs
- 2hr SQL
- 1hr Numpy
- 1hr Pandas
- 1hr SciPyStats
- 2hr quiz
- Apply


Days
- 1hr SQL
- 1hr Quiz and Practice Questions
- 1hr Projects
- 1hr Reading/Watching Knoweledge


Off
- 2hr SQL
- 2hr Apply
- 3hr Projects
- 1hr Reading/Watching Knoweledge
- 2hr Quiz and Practice Questions
- 1hr linear alg or stats or proba





def jobtype(data):
    """Used for creating a new feature containing nominal values for the job type.

    When several keywords occur, the last one in the order
    full -> part -> contract -> temp wins.

    Args:
        data (string): Values from the 'job_type_items' column.

    Returns:
        string or None: 'full', 'part', 'contract' or 'temp'; None when the
        value is not a string or contains none of the keywords.
    """
    if type(data) != str:
        return None

    out = None  # stays None when no keyword matches (previously an unbound local)
    for keyword in ('full', 'part', 'contract', 'temp'):
        if re.search(keyword, data, re.IGNORECASE):
            out = keyword
    return out

# Lower-case every free-text column.
cols = ['company','description', 'job_type_items','location','postdate','requirements', 'summary','job_title']
for txt_col in cols:
    data[txt_col] = data[txt_col].str.lower()



data['jobtype'] = data.job_type_items.apply(jobtype)

data['text'] = data.description+' '+data.summary
#TODO add requirements

cols = ['job_type_items','raw_desc_soup','url','description','summary','requirements']


# Series.replace swaps whole cell values; .str.replace is needed to replace the
# ')' characters *inside* each text.
data.text = data.text.str.replace(')', ' ', regex=False)



# NOTE(review): `annual` is not defined anywhere in the visible notebook — confirm where it lives.
data = annual(data)
#data.schedule.value_counts() keep and eye on this, may have to deal with more than just hourly
#data.schedule.value_counts() keep and eye on this, may have to deal with more than just hourly

'location'\
This feature holds a wealth of information: state, city/remote, and in some cases a zip code.
Zip codes seem to be a recent addition to the information I can get from Indeed.com but since its presence is rather sparse I'll just extract city/remote and state values, splitting them into their respective columns.

In [None]:
# List the distinct raw values found in the 'job_type_items' column.
data.job_type_items.unique()

'job_title'\
Although the web scraper searches for 'data scientist', the results contain over 80 unique job titles. This is largely due to a prefix or suffix that provides the role's specialization at that particular company for that particular job posting. These include items like 'Marketing', 'Senior', 'Geospatial...'.
I'll start by cleaning and simplifying these values, i.e. appearances of 'Sr' can be converted to 'Senior'. This will significantly reduce the number of unique values, hopefully increasing model accuracy.

In [None]:
# Number of distinct job titles, followed by a sample of the first ten.
print(len(data.job_title.unique()))
data.job_title.unique()[:10]

'requirements'\
I will most likely merge this in with the description prior to conducting NLP. The rationale is that these explicit requirements may be echoed in the description or summary, so by adding them to the corpus I'll raise the scores derived from their increased frequency. Eventually I would like to build a set of requirements that I can use with a clustering algorithm, like k-means, to find/define subgroups within the sample population.

In [None]:
# Peek at a single example value from the 'requirements' column.
data.requirements[0]

'summary'\
Much like the description, these are unique. Coming from the search results page itself, these are brief descriptions of each job. In most cases this is not merely an abridgement of the description. Like 'requirements', I'll start by lumping these texts in with the description so analysis of the corpus can be boosted by the added descriptions.

In [None]:
# Print the first three search-result summaries, one per line.
for summary_text in data.summary.iloc[:3]:
    print(summary_text)

'description'\
Each value is a unique string of the full job description. I'll lower-case it and remove special characters, but this feature will be dealt with separately in its own NLP process.


'raw_desc_soup'\
I'm only saving this in case I refine or change my parsing. Below is a truncated slice of one of the raw records.

'url'\
Like raw_desc_soup, I'm simply keeping these in case I need to go back and re-parse the raw data. In fact, this URL points directly to the page where that raw_desc_soup is found.
I may also be able to detect job post updates and duplicates by using these unique addresses.

In [None]:
# A notebook cell only renders its last bare expression, so the truncated raw
# record was silently discarded before; display() shows both values.
display(data.raw_desc_soup[1][:1000])
display(data.url[0])

### 2.3 Wrangling & Feature Engineering
Below are the implementations of the data wrangling steps I described above for each feature. Using NumPy, regex, and pandas I'll simultaneously clean the strings, extract and convert numeric information, and create new feature columns.

Let's see what the data looks like at this point. It looks like the 17 original features have been reduced to 11. Most of the columns have been collapsed and cleaned, too. Just a bit more work till this is ready for EDA, NLP, and transformations.

In [None]:
# Render the wrangled frame as it stands at this point.
data

In [None]:


# A missing company rating is recorded as 0.
data['rating'] = data['rating'].fillna(0)

# Remove the intermediate columns (including the original 'salary'), then
# promote the annualised figure to be the 'salary' column.
cols = ['extractdate', 'postdate', 'schedule', 'jobtype', 'salary']
data.drop(columns=cols, inplace=True)
data = data.rename(columns={'annual_sal': 'salary'})

# Run every salary value through sal_fixer.
data['salary'] = data['salary'].apply(sal_fixer)

In [None]:
# Inspect the resulting 'salary' column.
data.salary

In [None]:
data.columns
# Remove the raw/helper salary columns and make 'final_sal' the 'salary' column.
cols = ['salary','estimated_salary', 'job_type_items','raw_desc_soup', 'sal_guide_items','salary_and_jtype', 'salfromsection']
data.drop(cols,inplace=True,axis=1)
data.rename(columns={'final_sal': 'salary'},inplace=True)

# Keep only rows that have a description. A genuinely missing value is NaN,
# which never compares equal to the string 'NaN', so the missing-value test
# must be explicit; the literal-string comparison is kept for compatibility.
data = data[data.description.notna() & (data.description != 'NaN')]

len(data)

# Frequency of each description, skipping the most common one.
data.description.value_counts()[1:]

### 2.4 Assessment

In [None]:
# Column dtypes and non-null counts, with memory measured in 'deep' mode,
# followed by the first rows of the frame.
data.info(memory_usage='deep')
data.head()

In [None]:
# (rows, columns) of the wrangled frame, used in the summary message below.
shape = data.shape
print(f"The {shape[0]} rows of data now have {shape[1]} variables whose values are now cleaned and typed properly.\nThe only missing values are in Salary which is my target.\nNext, I'll conduct analysis, encoding, and scaling of each feature, starting with Salary which .")
# Summary statistics for every column, numeric or not.
data.describe(include = 'all')


In [None]:
# notnull() turns every column into booleans, so counting 'salary' shows how
# many rows have a salary (True) versus none (False).
presence = data.notnull()
sns.countplot(data=presence, x='salary', palette='Set3')
plt.show()

In [None]:
# Split the rows by whether a salary is present and report the coverage.
salary_missing = data.salary.isnull()
null = len(data[salary_missing])
nnull = len(data[~salary_missing])
print(f'{round(nnull/len(data)*100,2)}% of the data has salary information, should be an easy target')

In [None]:
# Keep only the rows whose combined 'text' value is present.
data = data[data['text'].notna()]

In [None]:
# Persist the wrangled frame as CSV, leaving the index out of the file.
data.to_csv(f'../app/data/wrangled_data.csv', index=False)

In [None]:
# Render the frame that was just written to disk.
data

In [None]:
# Work on an independent copy: a plain assignment would only alias `data`,
# so in-place edits to `munged` would silently modify `data` as well.
munged = data.copy()
munged

In [None]:
# Count rows with and without a salary, then report totals and percentages.
salary_missing = munged.salary.isnull()
null = len(munged[salary_missing])
nnull = len(munged[~salary_missing])
print(f'Droping {len(data)-len(munged)} duplicates and out of date posts leaves {len(munged)} rows of data for the regressor to be trained and tested on.')
print(f'- Of those {len(munged)} job postings {nnull} or {round(nnull/len(munged)*100,2)}% include salary information,\n- The remaining {null} rows, or {round(null/len(munged)*100,2)}% are missing salary data.')

## EDA

In [None]:
import numpy as np
import pandas as pd

from sklearn import metrics
from scipy.stats import pearsonr, chi2_contingency
from sklearn.preprocessing import StandardScaler, MinMaxScaler



import matplotlib.pyplot as plt
from datetime import datetime, date, timedelta

plt.rcParams['figure.figsize'] = (8, 8)
plt.rcParams['font.size'] = 17

import seaborn as sns
import re
import warnings # current version of seaborn generates a bunch of warnings that we'll ignore
warnings.filterwarnings("ignore")
sns.set(style="ticks", color_codes=True)
%matplotlib inline

### Target

### Numeric Features
### Categorical Features
### Feature Associations
#### Feature Selection and Hyperparameter Tuning
### Pipeline GridSearch
## Modelling
## Evaluations

## Detecting Outliers: Using both the Z-Score and IQR methods
Now that the data is preprocessed and the Salary target variable is built, let's conduct a bit more EDA to see if there are any outliers that could potentially skew how the regressor will learn from the training data.

In [None]:
# Plot salary against 'dateposted' for the rows that have a salary,
# colouring the points by salary value.
ax = sns.pairplot(data=munged[munged.salary.notnull()],
                  x_vars=['dateposted'],
                  y_vars=['salary'], height=8, hue="salary", palette="crest")

# Resize the figure to a wide 18x4-inch strip.
ax.fig.set_size_inches(18,4)

A plot is helpful in visualizing data, but let's program a few functions to detect and remove outliers based on thresholds.
With the Z-Score, we can determine any data outside 3 standard deviations from the mean of our salary data to be an outlier.
Whereas with the interquartile range (IQR), the middle 50% of given salaries, I'll use the conventional cutoff of 1.5 times the IQR below the first quartile or above the third quartile.
Values found outside of either range will be collected into a list and that'll allow for some fancy indexing so those particular postings can be examined and removed programmatically.

#### Z-Score
Salaries 3 standard deviations away from the mean will be listed as outliers.

In [None]:
def z_detect(munged, threshold=3):
    """Find salaries whose z-score magnitude exceeds ``threshold``.

    Args:
        munged (DataFrame): Frame with a numeric 'salary' column. Rows with a
            null salary are ignored.
        threshold (float): Number of standard deviations from the mean beyond
            which a salary counts as an outlier. Defaults to 3.

    Returns:
        list: The outlying salary values in row order; empty when there are
        none. The result is built locally, so repeated calls do not pile up
        duplicates in shared state.
    """
    sample = munged[munged['salary'].notnull()].salary
    mean = np.mean(sample)
    std = np.std(sample)

    z_scores = (sample - mean) / std
    return sample[np.abs(z_scores) > threshold].tolist()


# Module-level list of flagged salaries; later cells add to it and read from it.
outliers = z_detect(munged)
outliers

In [None]:
# Render the working frame.
munged

#### IQR
Salaries outside 1.5 times the interquartile range boundaries, either above or below will be listed as outliers.

In [None]:
def iqr_detect(munged, k=1.5):
    """Find salaries lying outside the interquartile-range fences.

    Args:
        munged (DataFrame): Frame with a numeric 'salary' column. Rows with a
            null salary are ignored.
        k (float): Multiple of the IQR subtracted from Q1 / added to Q3 to
            form the lower / upper fence. Defaults to 1.5.

    Returns:
        list: The salary values below the lower fence or above the upper
        fence, in row order; empty when there are none.
    """
    sample = munged[munged['salary'].notnull()].salary
    Q1, Q3 = np.percentile(sample, [25, 75])
    iqr = Q3 - Q1
    lower_bound = Q1 - (k * iqr)
    upper_bound = Q3 + (k * iqr)
    return sample[(sample < lower_bound) | (sample > upper_bound)].tolist()


# Add the IQR outliers to the shared list, then show every distinct salary
# flagged so far. An empty result now gives an empty set instead of failing
# on set(None).
outliers.extend(iqr_detect(munged))
set(outliers)

In [None]:
def unique(list1):
    """Print every distinct element of ``list1`` on its own line.

    Duplicates are removed by passing the values through a set, so the
    print order follows the set's iteration order rather than the input order.
    """
    for value in list(set(list1)):
        print(value)
        
# Smallest and largest flagged salary, printed around the list of distinct values.
mn, mx = min(outliers), max(outliers)
print(f'The {len(set(outliers))} Unique Outliers Are:')
unique(outliers)
print(f'- With a minumum of ${round(mn)} and a maximum of ${round(mx)}')

In [None]:
# Select the rows whose salary was flagged, remove them from `munged` in place,
# and display the removed rows. `outliers` is rebound here from the list of
# flagged salaries to the DataFrame of removed rows.
flagged_rows = munged['salary'].isin(outliers)
outliers = munged.loc[flagged_rows]
munged.drop(index=outliers.index, inplace=True)
outliers

In [None]:
# Recount the rows that have no salary value.
null = len(munged[munged.salary.isnull()])

# Recount the rows that do have a salary value.
nnull = len(munged[munged.salary.notnull()])

In [None]:
# Re-draw salary against 'dateposted' for the rows that have a salary,
# colouring the points by salary value.
ax = sns.pairplot(data=munged[munged.salary.notnull()],
                  x_vars=['dateposted'],
                  y_vars=['salary'], height=8, hue="salary", palette="crest")

# Resize the figure to a wide 18x4-inch strip.
ax.fig.set_size_inches(18,4)

In [None]:
# notnull() makes every column boolean, so this counts rows with vs. without a salary.
sns.countplot(x='salary', data=munged.notnull(), palette='Set3')
# Uses `outliers`, `null` and `nnull` as left by earlier cells.
print(f'- Dropping {len(outliers)} outliers now leaves {nnull}, or {round(nnull/len(munged)*100,2)}%, of rows with with salary information,\n- The remaining {null} rows, or {round(null/len(munged)*100,2)}%, are missing salary data.')


In [None]:
# Column dtypes and non-null counts after removing the outliers.
munged.info()

In [None]:
# Write the frame to the wrangled-data CSV, leaving the index out of the file.
munged.to_csv(f'../app/data/wrangled_data.csv', index=False)

In [None]:
# Column dtypes and non-null counts of the frame that was saved.
munged.info()

In [None]:
# Same summary with memory measured in 'deep' mode, before 'location' is dropped.
munged.info(memory_usage='deep')

In [None]:
# Remove the 'location' column from the working frame in place.
munged.drop(columns='location', inplace=True)

In [None]:
# Repeat the 'deep' memory summary to compare against the previous one.
munged.info(memory_usage='deep')