In [107]:
import pandas as pd  # Import the pandas library for data manipulation
import numpy as np  # Import numpy for numerical operations
from sklearn.preprocessing import StandardScaler  # Import StandardScaler for feature scaling
from sklearn.decomposition import PCA  # Import PCA for dimensionality reduction
from sklearn.model_selection import train_test_split  # Import train_test_split for splitting data into training and test sets

In [170]:
# Load the dataset from a CSV file into a DataFrame
df = pd.read_csv('data2.csv')

In [171]:
# Get unique company sizes from the 'Company Size' column and print them
unique_company_sizes = df['Company Size'].unique()
print(unique_company_sizes)

['Medium-Large (501-1000)' 'Giant (10000+)' 'Large (1001-5000)'
 'Small (51-200)' 'Medium (201-500)' 'Very Large (5001-10000)'
 'Very Small (1-50)' 'Unknown' nan]


In [172]:
# Display the number of rows in the DataFrame
len(df)

2177

In [173]:
# Show the first few rows of the DataFrame for inspection
df.head()

Unnamed: 0,Job Title,Job Description,Rating,Company Name,Location,Founded,Type of ownership,Industry,Sector,Median Salary,Company Size,Median Revenue
0,Data Scientist,"Data Scientist\nLocation: Albuquerque, NM\nEdu...",3.8,Tecolote Research\n3.8,"Albuquerque, NM",1973,Company - Private,Aerospace & Defense,Aerospace & Defense,72000.0,Medium-Large (501-1000),75000000.0
1,Healthcare Data Scientist,What You Will Do:\nI. General Summary\nThe Hea...,3.4,University of Maryland Medical System\n3.4,"Linthicum, MD",1984,Other Organization,Health Care Services & Hospitals,Health Care,87500.0,Giant (10000+),3500000000.0
2,Data Scientist,"KnowBe4, Inc. is a high growth information sec...",4.8,KnowBe4\n4.8,"Clearwater, FL",2010,Company - Private,Security Services,Business Services,85000.0,Medium-Large (501-1000),300000000.0
3,Data Scientist,*Organization and Job ID**\nJob ID: 310709\nDi...,3.8,PNNL\n3.8,"Richland, WA",1965,Government,Energy,"Oil, Gas, Energy & Utilities",76500.0,Large (1001-5000),1000000000.0
4,Data Scientist,Data Scientist\nAffinity Solutions / Marketing...,2.9,Affinity Solutions\n2.9,"New York, NY",1998,Company - Private,Advertising & Marketing,Business Services,114500.0,Small (51-200),


In [174]:
# Check for missing values in the DataFrame
df.isna().sum()

Unnamed: 0,0
Job Title,6
Job Description,5
Rating,78
Company Name,13
Location,8
Founded,297
Type of ownership,11
Industry,157
Sector,124
Median Salary,437


In [175]:
# List the columns of the DataFrame
df.columns

Index(['Job Title', 'Job Description', 'Rating', 'Company Name', 'Location',
       'Founded', 'Type of ownership', 'Industry', 'Sector', 'Median Salary',
       'Company Size', 'Median Revenue'],
      dtype='object')

In [176]:
# Display the number of rows in the DataFrame again
len(df)

2177

In [177]:
# Check for duplicate rows in the DataFrame and count them
df.duplicated().sum()

719

In [178]:
# Remove duplicate rows, keeping the first occurrence
df = df.drop_duplicates(keep='first')

In [179]:
# Get unique company sizes again after dropping duplicates and print them
unique_company_sizes = df['Company Size'].unique()
print(unique_company_sizes)

['Medium-Large (501-1000)' 'Giant (10000+)' 'Large (1001-5000)'
 'Small (51-200)' 'Medium (201-500)' 'Very Large (5001-10000)'
 'Very Small (1-50)' 'Unknown' nan]


In [180]:
# Display the number of rows in the DataFrame after removing duplicates
len(df)

1458

In [181]:
# Print the count of missing values for each column
print(df.isnull().sum())

Job Title              6
Job Description        5
Rating                51
Company Name          12
Location               8
Founded              171
Type of ownership     10
Industry              92
Sector                80
Median Salary        272
Company Size          35
Median Revenue       779
dtype: int64


In [182]:
# Define columns that need to be converted to numeric types
columns_to_convert = ['Rating', 'Founded', 'Median Salary']

In [None]:
# Convert specified columns to numeric, coercing errors to NaN
for column in columns_to_convert:
    df[column] = pd.to_numeric(df[column], errors='coerce')

In [None]:
# List all numerical columns in the DataFrame
numerical_columns = df.select_dtypes(include=['float64', 'int64']).columns.tolist()

In [183]:
# Function to impute missing values using a Random Forest Regressor
def impute_missing_values(df, target_column):
    # Create a subset for training by selecting rows where the target column is not null
    train_df = df[df[target_column].notnull()]
    # Create a subset for testing by selecting rows where the target column is null
    test_df = df[df[target_column].isnull()]

    # Define features by selecting all numerical columns except the target column
    features = train_df.select_dtypes(include=['float64', 'int64']).columns.tolist()
    features.remove(target_column)

    # Check if there are valid features and sufficient training data
    if len(features) == 0 or train_df.empty:
        print(f'Skipping imputation for {target_column}: No valid training data or features available.')
        return df

    # Separate features and target for training
    X_train = train_df[features]  # Features for training
    y_train = train_df[target_column]  # Target variable for training

    # Split the training data into training and validation sets (80-20 split)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

    # Train a Random Forest Regressor on the training data
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # Validate the model using the validation set
    y_pred = model.predict(X_val)  # Predictions on the validation set
    rmse = mean_squared_error(y_val, y_pred, squared=False)  # Calculate the root mean squared error
    print(f'Validation RMSE for {target_column}: {rmse}')

    # Predict the missing values in the test set if it is not empty
    if not test_df.empty:
        X_test = test_df[features]  # Features for prediction
        if len(X_test) > 0:  # Check if there are samples to predict
            predicted_values = model.predict(X_test)  # Predict missing values
            # Fill missing values in the original DataFrame with predicted values
            df.loc[df[target_column].isnull(), target_column] = predicted_values
        else:
            print(f'No data available for prediction in {target_column}.')

# Iterate over all numerical columns and impute missing values for each
for column in numerical_columns:
    impute_missing_values(df, column)



Validation RMSE for Rating: 0.5330566413330052




Validation RMSE for Founded: 61.84431029717669




Validation RMSE for Median Salary: 113817593.01281676
Validation RMSE for Median Revenue: 1806492190.3556607




In [184]:
# Display the 'Company Size' column for verification
df['Company Size']

Unnamed: 0,Company Size
0,Medium-Large (501-1000)
1,Giant (10000+)
2,Medium-Large (501-1000)
3,Large (1001-5000)
4,Small (51-200)
...,...
1912,Large (1001-5000)
1913,
1915,Giant (10000+)
1973,Medium (201-500)


In [187]:
# Check for missing values again in the DataFrame
df.isna().sum()

Unnamed: 0,0
Job Title,6
Job Description,5
Rating,0
Company Name,12
Location,8
Founded,0
Type of ownership,10
Industry,92
Sector,80
Median Salary,0


In [186]:
# Display the first few rows of the DataFrame after imputation
df.head()

Unnamed: 0,Job Title,Job Description,Rating,Company Name,Location,Founded,Type of ownership,Industry,Sector,Median Salary,Company Size,Median Revenue
0,Data Scientist,"Data Scientist\nLocation: Albuquerque, NM\nEdu...",3.8,Tecolote Research\n3.8,"Albuquerque, NM",1973.0,Company - Private,Aerospace & Defense,Aerospace & Defense,72000.0,Medium-Large (501-1000),75000000.0
1,Healthcare Data Scientist,What You Will Do:\nI. General Summary\nThe Hea...,3.4,University of Maryland Medical System\n3.4,"Linthicum, MD",1984.0,Other Organization,Health Care Services & Hospitals,Health Care,87500.0,Giant (10000+),3500000000.0
2,Data Scientist,"KnowBe4, Inc. is a high growth information sec...",4.8,KnowBe4\n4.8,"Clearwater, FL",2010.0,Company - Private,Security Services,Business Services,85000.0,Medium-Large (501-1000),300000000.0
3,Data Scientist,*Organization and Job ID**\nJob ID: 310709\nDi...,3.8,PNNL\n3.8,"Richland, WA",1965.0,Government,Energy,"Oil, Gas, Energy & Utilities",76500.0,Large (1001-5000),1000000000.0
4,Data Scientist,Data Scientist\nAffinity Solutions / Marketing...,2.9,Affinity Solutions\n2.9,"New York, NY",1998.0,Company - Private,Advertising & Marketing,Business Services,114500.0,Small (51-200),810025000.0


In [188]:
# Example to fill missing values for specific categorical columns with 'Unknown'
df['Job Title'].fillna('Unknown', inplace=True)
df['Job Description'].fillna('Unknown', inplace=True)
df['Company Name'].fillna('Unknown', inplace=True)
df['Location'].fillna('Unknown', inplace=True)
df['Type of ownership'].fillna('Unknown', inplace=True)
df['Industry'].fillna('Unknown', inplace=True)
df['Sector'].fillna('Unknown', inplace=True)
df['Company Size'].fillna('Unknown', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Job Title'].fillna('Unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Job Description'].fillna('Unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are settin

In [189]:
# List the columns of the DataFrame for verification
df.columns

Index(['Job Title', 'Job Description', 'Rating', 'Company Name', 'Location',
       'Founded', 'Type of ownership', 'Industry', 'Sector', 'Median Salary',
       'Company Size', 'Median Revenue'],
      dtype='object')

In [193]:
# Import additional libraries for text processing
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer  # Import TF-IDF vectorizer for text feature extraction
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS  # Import English stop words
from nltk.corpus import stopwords  # Import stopwords from NLTK
from nltk.stem import WordNetLemmatizer  # Import lemmatizer for text normalization
from nltk.tokenize import word_tokenize  # Import tokenizer to split text into words
import nltk  # Import NLTK library

# Download the required NLTK resources
nltk.download('stopwords')  # Download stopwords
nltk.download('punkt')  # Download tokenizer
nltk.download('wordnet')  # Download WordNet for lemmatization

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [194]:
# Function to preprocess text data
def preprocess_text(text):
    # Tokenize the text and convert it to lowercase
    words = word_tokenize(text.lower())

    # Remove stopwords and non-alphabetic words
    stop_words = set(stopwords.words('english')).union(ENGLISH_STOP_WORDS)  # Combine NLTK and sklearn stop words
    additional_stopwords = ['position', 'responsible', 'required', 'experience', 'applicant', 'responsibility']  # Add custom stop words
    stop_words.update(additional_stopwords)

    # Filter out stopwords and non-alphabetic words
    words = [word for word in words if word.isalpha() and word not in stop_words]

    # Lemmatization: Normalize words to their base form
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]

    return ' '.join(words)  # Return the processed text as a single string

# Preprocess all job descriptions in the DataFrame
df['processed_description'] = df['Job Description'].apply(preprocess_text)

# Function to extract top keywords for each job description independently
def extract_keywords(description):
    # Use TF-IDF to vectorize the job description
    vectorizer = TfidfVectorizer(max_features=20, ngram_range=(1, 3), min_df=1)  # Increased n-gram range for better keyword extraction
    tfidf_matrix = vectorizer.fit_transform([description])  # Vectorize the single job description

    # Extract top keywords based on their TF-IDF score
    feature_names = vectorizer.get_feature_names_out()  # Get feature names from the vectorizer
    tfidf_scores = tfidf_matrix.toarray()[0]  # Convert sparse matrix to dense array

    # Sort the keywords based on TF-IDF score (highest first)
    top_keywords_indices = tfidf_scores.argsort()[-10:][::-1]  # Get indices of the top 10 keywords
    top_keywords = [feature_names[i] for i in top_keywords_indices]  # Extract top keywords using indices

    return ', '.join(top_keywords)  # Return the top keywords as a comma-separated string

# Extract keywords for each job description and store them in a new column
df['Keywords'] = df['processed_description'].apply(extract_keywords)

In [197]:
# Combine various features into a single DataFrame for further analysis
features = pd.concat([df[['Rating', 'Founded', 'Median Salary', 'Median Revenue']], df_encoded, df['Keywords']], axis=1)

In [198]:
# Vectorize the 'Keywords' column to create a TF-IDF representation of keywords
tfidf_vectorizer = TfidfVectorizer(max_features=20, ngram_range=(1, 3), min_df=1)  # Define the vectorizer
keywords_tfidf = tfidf_vectorizer.fit_transform(df['Keywords']).toarray()  # Transform keywords into a TF-IDF array

In [199]:
# Concatenate the TF-IDF features with the existing features DataFrame
features = pd.concat([features.reset_index(drop=True), pd.DataFrame(keywords_tfidf)], axis=1)

In [None]:
# Import the NearestNeighbors class from the sklearn.neighbors module
from sklearn.neighbors import NearestNeighbors

# Initialize the K-Nearest Neighbors model
# - n_neighbors: The number of nearest neighbors to consider (set to 5 here).
# - n_jobs: The number of jobs to run in parallel. '-1' uses all processors.
knn_model = NearestNeighbors(n_neighbors=5, n_jobs=-1)  # Adjust the number of neighbors as needed

# Fit the KNN model to the features dataset
# This step involves training the model using the provided features, which will allow it to find the nearest neighbors based on the distances between points in the feature space.
knn_model.fit(features)