In [15]:
import boto3
import pandas as pd
from io import BytesIO

In [16]:
aws_access_key_id = ''
aws_secret_access_key = ''

bucket_name = ''

# Create an S3 client
s3 = boto3.client('s3', aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key)

# Specify the file key for the CSV file in AWS S3
file_key = 'with_20_keywords.csv'

# Retrieve the object from S3 bucket using the specified file key
obj = s3.get_object(Bucket=bucket_name, Key=file_key)

# Read the content of the object
content = obj['Body'].read()

# Create a Pandas DataFrame from the CSV content
final_df = pd.read_csv(BytesIO(content), engine='python')

In [17]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1462 entries, 0 to 1461
Data columns (total 40 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Title             1462 non-null   object 
 1   Description       1462 non-null   object 
 2   Company Name      1462 non-null   object 
 3   City              1462 non-null   object 
 4   State             1462 non-null   object 
 5   Salary            1462 non-null   int64  
 6   Year              1462 non-null   float64
 7   Month             1462 non-null   float64
 8   Day               1462 non-null   float64
 9   CS_keywords       1462 non-null   int64  
 10  python            1462 non-null   int64  
 11  work              1462 non-null   int64  
 12  data              1462 non-null   int64  
 13  experience        1462 non-null   int64  
 14  scientist         1462 non-null   int64  
 15  new               1462 non-null   int64  
 16  opportunities     1462 non-null   int64  


## Introducing 3 category of salary

In [18]:
# Find the quartiles
q1 = final_df['Salary'].quantile(0.33)
q2 = final_df['Salary'].quantile(0.67)

# Define the categories based on quartiles
final_df['Salary_Category'] = pd.cut(final_df['Salary'], bins=[-float('inf'), q1, q2, float('inf')], labels=['Low', 'Medium', 'High'])

# Check the distribution
print(final_df['Salary_Category'].value_counts())
print("33rd Percentile (Low to Medium threshold):", q1)
print("67th Percentile (Medium to High threshold):", q2)

Medium    513
Low       494
High      455
Name: Salary_Category, dtype: int64
33rd Percentile (Low to Medium threshold): 90000.0
67th Percentile (Medium to High threshold): 125000.0


## Model with improved dataset

In [40]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MaxAbsScaler
from sklearn.utils import check_array
from sklearn.preprocessing import LabelEncoder

# Specify the columns to exclude from summation
columns_to_exclude = ['Title', 'Description', 'Company Name', 'City', 'State', 'Salary', 'Year', 'Month', 'Day', 'CS_keywords']

# Select columns to sum by dropping the ones to exclude
columns_to_sum = final_df.drop(columns_to_exclude, axis=1)

# Add a new column 'hc_keywords' which contains the sum of all specified columns
final_df['hc_keywords'] = columns_to_sum.sum(axis=1)

# Drop the target variable and Description column
X = final_df[['Company Name', 'City', 'State','Year','Month', 'Day', 'hc_keywords']]
y = final_df['Salary_Category']

# Identify non-numeric columns
non_numeric_cols = X.select_dtypes(include=['object']).columns.tolist()

# Apply one-hot encoding to non-numeric columns
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), non_numeric_cols)
    ],
    remainder='passthrough'
)

# Fit and transform the entire dataset
X_encoded = preprocessor.fit_transform(X)

# Use ordinal encoding for the salary categories
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Normalize the data
scaler = StandardScaler(with_mean=False)  # Set with_mean=False for sparse matrices
X_normalized = scaler.fit_transform(X_encoded)

# Split the encoded and normalized data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_normalized, y, test_size=0.05, random_state=43)

# Define the parameter grid to search
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Create a RandomForestClassifier
rf_classifier = RandomForestClassifier(random_state=42)

# Perform Randomized Search Cross Validation
random_search = RandomizedSearchCV(estimator=rf_classifier, param_distributions=param_grid, n_iter=100, cv=3, verbose=0, random_state=42, n_jobs=-1)

# Fit the random search model
random_search.fit(X_train, y_train)

# Get the best parameters and best score
print("Best Parameters:", random_search.best_params_)
print("Best Score:", random_search.best_score_)

# Initialize and train the models
models = {
    "Random Forest": RandomForestClassifier(**random_search.best_params_, random_state=43),
    "Multi-layer Perceptron": MLPClassifier(random_state=42)
}

for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"Evaluation report for {name}:")
    print(classification_report(y_test, y_pred))
    print()


  final_df['hc_keywords'] = columns_to_sum.sum(axis=1)


Best Parameters: {'n_estimators': 200, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_depth': 30, 'bootstrap': True}
Best Score: 0.6037605303264051
Training Random Forest...
Evaluation report for Random Forest:
              precision    recall  f1-score   support

           0       0.79      0.50      0.61        22
           1       0.84      0.90      0.87        30
           2       0.61      0.77      0.68        22

    accuracy                           0.74        74
   macro avg       0.75      0.72      0.72        74
weighted avg       0.76      0.74      0.74        74


Training Multi-layer Perceptron...
Evaluation report for Multi-layer Perceptron:
              precision    recall  f1-score   support

           0       0.65      0.50      0.56        22
           1       0.62      0.70      0.66        30
           2       0.52      0.55      0.53        22

    accuracy                           0.59        74
   macro avg       0.60      0.58      0.58      

[CV] END bootstrap=False, max_depth=30, min_samples_leaf=4, min_samples_split=10, n_estimators=100; total time=   0.4s
[CV] END bootstrap=False, max_depth=20, min_samples_leaf=2, min_samples_split=10, n_estimators=100; total time=   0.3s
[CV] END bootstrap=False, max_depth=20, min_samples_leaf=4, min_samples_split=2, n_estimators=300; total time=   0.6s
[CV] END bootstrap=True, max_depth=20, min_samples_leaf=4, min_samples_split=2, n_estimators=200; total time=   0.2s
[CV] END bootstrap=False, max_depth=10, min_samples_leaf=4, min_samples_split=10, n_estimators=100; total time=   0.1s
[CV] END bootstrap=False, max_depth=10, min_samples_leaf=4, min_samples_split=10, n_estimators=100; total time=   0.2s
[CV] END bootstrap=False, max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.8s
[CV] END bootstrap=False, max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.9s
[CV] END bootstrap=False, max_depth=10, min_samples_l

[CV] END bootstrap=False, max_depth=30, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time=   0.4s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=2, min_samples_split=10, n_estimators=100; total time=   0.2s
[CV] END bootstrap=False, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   0.6s
[CV] END bootstrap=False, max_depth=10, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time=   0.3s
[CV] END bootstrap=False, max_depth=10, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time=   0.3s
[CV] END bootstrap=False, max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.9s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=   0.4s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=4, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END bootstrap=True, max_depth=None, min_sampl

## Model without hc_keyword column features

In [37]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import check_array

# Drop the target variable and Description column
X = final_df[['Company Name', 'City', 'State', 'Year', 'Month', 'Day']]
y = final_df['Salary_Category']

# Identify non-numeric columns
non_numeric_cols = X.select_dtypes(include=['object']).columns.tolist()

# Apply one-hot encoding to non-numeric columns
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), non_numeric_cols)
    ],
    remainder='passthrough'
)

# Fit and transform the entire dataset
X_encoded = preprocessor.fit_transform(X)

# Use ordinal encoding for the salary categories
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Normalize the data
scaler = StandardScaler(with_mean=False)  # Set with_mean=False for sparse matrices
X_normalized = scaler.fit_transform(X_encoded)

# Split the encoded and normalized data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_normalized, y, test_size=0.05, random_state=43)

# Define the parameter grid to search
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Create a RandomForestClassifier
rf_classifier = RandomForestClassifier(random_state=42)

# Perform Randomized Search Cross Validation
random_search = RandomizedSearchCV(estimator=rf_classifier, param_distributions=param_grid, n_iter=100, cv=3, verbose=0, random_state=42, n_jobs=-1)

# Fit the random search model
random_search.fit(X_train, y_train)

# Get the best parameters and best score
print("Best Parameters:", random_search.best_params_)
print("Best Score:", random_search.best_score_)

# Initialize and train the models
models = {
    "Random Forest": RandomForestClassifier(**random_search.best_params_, random_state=43),
    "Multi-layer Perceptron": MLPClassifier(random_state=42)
}

for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"Evaluation report for {name}:")
    print(classification_report(y_test, y_pred))
    print()


Best Parameters: {'n_estimators': 300, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_depth': 30, 'bootstrap': True}
Best Score: 0.5771070781869918
Training Random Forest...
Evaluation report for Random Forest:
              precision    recall  f1-score   support

           0       0.67      0.45      0.54        22
           1       0.84      0.90      0.87        30
           2       0.56      0.68      0.61        22

    accuracy                           0.70        74
   macro avg       0.69      0.68      0.67        74
weighted avg       0.71      0.70      0.70        74


Training Multi-layer Perceptron...
Evaluation report for Multi-layer Perceptron:
              precision    recall  f1-score   support

           0       0.45      0.77      0.57        22
           1       0.67      0.47      0.55        30
           2       0.53      0.36      0.43        22

    accuracy                           0.53        74
   macro avg       0.55      0.53      0.52      