In [3]:
import pandas as pd
import numpy as np
import os


In [18]:
# Import data and read in files 

onet_salaries = pd.read_excel("dataset/onet_data_with_salaries.xlsx") 
onet_salaries.to_csv("dataset/onet_data_with_salaries.csv", index=False)
onet_human = pd.read_excel("dataset/onet_data_with_human_characteristics.xlsx") 
onet_human.to_csv("dataset/onet_data_with_human_characteristics.csv", index=False)
onet_complete = pd.read_excel("dataset/complete_onet_data_with_human_characteristics.xlsx")
onet_complete.to_csv("dataset/complete_onet_data_with_human_characteristics.csv", index=False)


mbti = pd.read_csv("dataset/myer-briggs-data.csv") 
onet_salaries = pd.read_csv("dataset/onet_data_with_salaries.csv") 
onet_human = pd.read_csv("dataset/onet_data_with_human_characteristics.csv") 


In [21]:
###TASK 1: Dataset Inspection###
#inspecting the datasets
def inspect_dataset(df,name="Dataset"):
  print(f"/n====={name} INSPECTION=====")

#looking through shape and info about the datasets
  print(f"Shape: {df.shape}")
  print(f"Columns: {df.columns}")
  print("Info:")
  print(df.info())

  print("\nHead:")
  print(df.head())
#missing values
  print("\nMissing Values per Column:")
  print(df.isnull().sum())

#summary statistics
  print("\nSummary Statistics:(numeric columns):")
  print(df.describe())

  #for checking outliers
  print("\nOutlier Check (based on IQR, numeric columns):")
  numeric_cols = df.select_dtypes(include=[np.number]).columns
  for col in numeric_cols:
      q1 = df[col].quantile(0.25)
      q3 = df[col].quantile(0.75)
      iqr = q3 - q1

      outliers = df[(df[col] < (q1 - 1.5 * iqr)) | (df[col] > (q3 + 1.5 * iqr))]
      print(f"{col}: {len(outliers)} potential outliers")

  print("="*50)

# Inspecting all the datasets
inspect_dataset(mbti, "MBTI Dataset")
inspect_dataset(onet_salaries, "O*NET Salaries Dataset")
inspect_dataset(onet_human, "O*NET Human Characteristics Dataset")

/n=====MBTI Dataset INSPECTION=====
Shape: (8675, 2)
Columns: Index(['type', 'posts'], dtype='object')
Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8675 entries, 0 to 8674
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   type    8675 non-null   object
 1   posts   8675 non-null   object
dtypes: object(2)
memory usage: 135.7+ KB
None

Head:
   type                                              posts
0  INFJ  'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1  ENTP  'I'm finding the lack of me in these posts ver...
2  INTP  'Good one  _____   https://www.youtube.com/wat...
3  INTJ  'Dear INTP,   I enjoyed our conversation the o...
4  ENTJ  'You're fired.|||That's another silly misconce...

Missing Values per Column:
type     0
posts    0
dtype: int64

Summary Statistics:(numeric columns):
        type                                              posts
count   8675                                               8675
uni

In [24]:
###TASK 2: Standardization Diagnostics:###
# Step 1: Align Column Names
def standardize_columns(df):
    df.columns = (
        df.columns
        .str.strip()
        .str.lower()
        .str.replace(" ", "_")
        .str.replace("-", "_")
    )
    return df

onet_salaries = standardize_columns(onet_salaries)
onet_human = standardize_columns(onet_human)
onet_complete = standardize_columns(onet_complete)

# Reorder columns to match complete dataset (base schema)
onet_salaries = onet_salaries.reindex(columns=onet_complete.columns.intersection(onet_salaries.columns))
onet_human = onet_human.reindex(columns=onet_complete.columns)

# Step 2: Identify Redundancy
def compare_columns(base, other, name):
    base_cols = set(base.columns)
    other_cols = set(other.columns)

    missing = base_cols - other_cols
    extra = other_cols - base_cols

    print(f"\n Schema check for {name}:")
    if missing:
        print(f"  - Missing columns: {missing}")
    else:
        print("  - No missing columns compared to base.")
    if extra:
        print(f"  - Extra columns: {extra}")
    else:
        print("  - No extra columns compared to base.")

compare_columns(onet_complete, onet_salaries, "O*NET Salaries")
compare_columns(onet_complete, onet_human, "O*NET Human Characteristics")

print(" The other ONET datasets are redundant once alignment is done.")

# Step 3: Drop Duplicates
# Keep only the complete dataset as the single source of truth
onet_standardized = onet_complete.drop_duplicates()

# Save Cleaned Dataset
output_path = "dataset/onet_standardized.csv"
onet_standardized.to_csv(output_path, index=False)

print("\n Cleaning pipeline complete.")
print(" Using `complete_onet_data_with_human_characteristics` as the single source of truth.")
print(f" Final shape: {onet_standardized.shape} (rows, columns)")
print(f" Final dataset saved at: {output_path}")

# Preview first rows
print("\n Preview of standardized ONET dataset:")
display(onet_standardized.head())


 Schema check for O*NET Salaries:
  - Missing columns: {'human_characteristics'}
  - No extra columns compared to base.

 Schema check for O*NET Human Characteristics:
  - No missing columns compared to base.
  - No extra columns compared to base.
 The other ONET datasets are redundant once alignment is done.

 Cleaning pipeline complete.
 Using `complete_onet_data_with_human_characteristics` as the single source of truth.
 Final shape: (1016, 30) (rows, columns)
 Final dataset saved at: dataset/onet_standardized.csv

 Preview of standardized ONET dataset:


Unnamed: 0,occupation_code,title,human_characteristics,description,sample_job_titles,tasks,knowledge,skills,abilities,work_activities,...,work_styles,work_values,work_context,additional_sources,related_occupations,annual_10th_percentile,annual_25th_percentile,annual_median_wage,annual_75th_percentile,annual_90th_percentile
0,13-2011.00,Accountants and Auditors,"Attention to Detail, Analytical Thinking, Inte...","Examine, analyze, and interpret accounting rec...","Accountant, Accounting Officer, Audit Partner,...","Prepare detailed reports on audit findings., R...","Economics and Accounting, English Language, Ma...","Reading Comprehension, Active Listening, Criti...","Oral Comprehension, Oral Expression, Written C...","Getting Information, Communicating with Superv...",...,"Attention to Detail, Integrity, Dependability,...","Achievement, Independence, Recognition","E-Mail, Telephone Conversations, Face-to-Face ...","AACSB, AICPA and CIMA, American Accounting Ass...","Bookkeeping, Accounting, and Auditing Clerks (...",50440.0,62720.0,79880.0,103990.0,137280.0
1,27-2011.00,Actors,"Communication, Adaptability, Problem Solving, ...","Play parts in stage, television, radio, video,...","Actor, Actress, Comedian, Comic, Community The...",Collaborate with other actors as part of an en...,"Fine Arts, English Language, Communications an...","Reading Comprehension, Speaking, Active Listen...","Oral Expression, Oral Comprehension, Memorizat...",Establishing and Maintaining Interpersonal Rel...,...,"Cooperation, Persistence, Adaptability/Flexibi...","Relationships, Achievement, Independence",Work With or Contribute to a Work Group or Tea...,"Actors' Equity Association, American Associati...","Choreographers (27-2032.00), Music Directors a...",,,,,
2,15-2011.00,Actuaries,"Communication, Adaptability, Problem Solving, ...","Analyze statistical data, such as mortality, a...","Actuarial Analyst, Actuarial Associate, Actuar...",Ascertain premium rates required and cash rese...,"Mathematics, Computers and Electronics, Econom...","Critical Thinking, Judgment and Decision Makin...","Mathematical Reasoning, Inductive Reasoning, N...","Analyzing Data or Information, Processing Info...",...,"Analytical Thinking, Attention to Detail, Inte...","Working Conditions, Achievement, Independence","E-Mail, Indoors, Environmentally Controlled, S...","American Academy of Actuaries, Casualty Actuar...","Accountants and Auditors (13-2011.00), Compens...",75380.0,88420.0,120000.0,164320.0,209310.0
3,29-1291.00,Acupuncturists,"Communication, Adaptability, Problem Solving, ...","Diagnose, treat, and prevent disorders by stim...","Acupuncture Physician, Acupuncture Provider, A...",Develop individual treatment plans and strateg...,"Medicine and Dentistry, Customer and Personal ...","Active Listening, Critical Thinking, Social Pe...","Deductive Reasoning, Oral Comprehension, Oral ...","Assisting and Caring for Others, Updating and ...",...,"Integrity, Dependability, Self-Control, Attent...","Achievement, Independence, Relationships","Indoors, Environmentally Controlled, Physical ...",American Association of Acupuncture and Orient...,"Cardiologists (29-1212.00), Chiropractors (29-...",41600.0,52000.0,78220.0,99740.0,140660.0
4,29-1141.01,Acute Care Nurses,"Compassion, Patience, Attention to Detail, Str...",Provide advanced nursing care for patients wit...,"Cardiac Interventional Care Nurse, Charge Nurs...","Perform emergency medical procedures, such as ...","Medicine and Dentistry, Customer and Personal ...","Active Listening, Critical Thinking, Monitorin...","Oral Comprehension, Problem Sensitivity, Deduc...","Assisting and Caring for Others, Documenting/R...",...,"Integrity, Stress Tolerance, Attention to Deta...","Relationships, Support, Achievement","Exposed to Disease or Infections, Telephone Co...","American Association of Colleges of Nursing, A...",Advanced Practice Psychiatric Nurses (29-1141....,63720.0,75990.0,86070.0,104670.0,132680.0


In [25]:
### TASK 3: Clean data, remove stopwords and tokenize text ###
import pandas as pd
import numpy as np
import re
import string
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

stop_words = set(ENGLISH_STOP_WORDS)

def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = text.split()  # instead of word_tokenize
    tokens = [w for w in tokens if w not in stop_words]
    return " ".join(tokens)

mbti['cleaned_text'] = mbti['posts'].apply(clean_text)

# TF-IDF vectorizer
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1,2), min_df=5)
X = vectorizer.fit_transform(mbti['cleaned_text'])
y = mbti['type']

mbti

# Split the 'posts' column by '|'
#mbti_expanded = mbti.assign(posts=mbti['posts'].str.split('|')).explode('posts')

# Clean the text again
#mbti_expanded['cleaned_text'] = mbti_expanded['posts'].apply(clean_text)

#mbti_expanded.head()


[nltk_data] Downloading package punkt to /Users/srahman/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/srahman/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Unnamed: 0,type,posts,cleaned_text
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,intj moments sportscenter plays prankswhat lif...
1,ENTP,'I'm finding the lack of me in these posts ver...,im finding lack posts alarmingsex boring posit...
2,INTP,'Good one _____ https://www.youtube.com/wat...,good course say know thats blessing cursedoes ...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o...",dear intp enjoyed conversation day esoteric ga...
4,ENTJ,'You're fired.|||That's another silly misconce...,youre firedthats silly misconception approachi...
...,...,...,...
8670,ISFP,'https://www.youtube.com/watch?v=t8edHB_h908||...,just think cats fi doms reason websites haven ...
8671,ENFP,'So...if this thread already exists someplace ...,soif thread exists someplace does heck delete ...
8672,INTP,'So many questions when i do these things. I ...,questions things purple pill pick winning lott...
8673,INFP,'I am very conflicted right now when it comes ...,conflicted right comes wanting children honest...


In [None]:
###TASK 4: Categorical Encoding###
