In [2]:
# ================================
# Data Science Salaries Analysis
# ================================

import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# ---------------------------
# Load Dataset
# ---------------------------
# Read the salaries CSV into a DataFrame. The path is absolute, so this cell
# only works where that location exists (presumably a mounted Google Drive in
# Colab -- confirm before running elsewhere).
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Data/datascience_salaries.csv")

print("--- Preview Dataset ---")
print(df.head())

print("\n--- Dataset Info ---")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
df.info()

# ---------------------------
# 1. Normalize salary column
# ---------------------------
print("\n--- Normalizing Salary ---")
# Min-max scale the salary column and store the result next to the original.
scaler = MinMaxScaler()
salary_frame = df[['salary']]  # double brackets: the scaler expects 2-D input
scaled_salary = scaler.fit_transform(salary_frame)
# Flatten the (n_rows, 1) result to 1-D so it can be assigned as a column.
df['salary_normalized'] = scaled_salary.reshape(-1)
comparison_cols = ['salary', 'salary_normalized']
print(df[comparison_cols].head())

# ---------------------------
# 2. Dimensionality Reduction
# ---------------------------
print("\n--- Applying PCA ---")
# Columns that must not be fed to PCA:
#   - 'Unnamed: 0' is the row id carried over from the CSV, not a feature.
#   - 'salary_normalized' is only a rescaled copy of 'salary'.
#   - 'PCA1'/'PCA2' (and 'tSNE1'/'tSNE2' if the optional step is enabled)
#     exist when this cell is re-run; keeping them would feed the previous
#     projection back into the new fit.
non_feature_cols = [
    'Unnamed: 0', 'salary_normalized', 'PCA1', 'PCA2', 'tSNE1', 'tSNE2',
]
feature_df = df.drop(columns=non_feature_cols, errors='ignore')

# One-hot encode the text columns so the categorical attributes take part in
# the projection; numeric columns pass through get_dummies unchanged.
encoded_df = pd.get_dummies(feature_df, dtype=float)

# PCA is variance-based, so bring every column onto the same [0, 1] range;
# otherwise raw salary magnitudes would dominate the components.
# A separate scaler is used so the salary `scaler` fitted earlier stays intact.
feature_scaler = MinMaxScaler()
numeric_df = pd.DataFrame(
    feature_scaler.fit_transform(encoded_df),
    columns=encoded_df.columns,
    index=encoded_df.index,
)

# Apply PCA to reduce to 2 components
pca = PCA(n_components=2)
pca_result = pca.fit_transform(numeric_df)

df['PCA1'] = pca_result[:, 0]
df['PCA2'] = pca_result[:, 1]

print("Explained variance ratio:", pca.explained_variance_ratio_)
print(df[['PCA1', 'PCA2']].head())

# (Optional) Try t-SNE for visualization (slower than PCA)
# tsne = TSNE(n_components=2, random_state=42)
# tsne_result = tsne.fit_transform(numeric_df)
# df['tSNE1'] = tsne_result[:,0]
# df['tSNE2'] = tsne_result[:,1]

# ---------------------------
# 3. Group by experience_level
# ---------------------------
print("\n--- Grouping by experience_level ---")
# Mean and median salary for each experience level, one row per level.
salary_by_level = df.groupby('experience_level')['salary']
summary_functions = ['mean', 'median']
salary_stats = salary_by_level.agg(summary_functions)
print(salary_stats)

# ---------------------------
# Final Check
# ---------------------------
# Show the first rows again, now including the columns added by the steps above.
print("\n--- Final Dataset Preview ---")
final_preview = df.head()
print(final_preview)


--- Preview Dataset ---
   Unnamed: 0       job_title   job_type experience_level       location  \
0           0  Data scientist  Full Time           Senior  New York City   
1           2  Data scientist  Full Time           Senior         Boston   
2           3  Data scientist  Full Time           Senior         London   
3           4  Data scientist  Full Time           Senior         Boston   
4           5  Data scientist  Full Time           Senior  New York City   

  salary_currency  salary  
0             USD  149000  
1             USD  120000  
2             USD   68000  
3             USD  120000  
4             USD  149000  

--- Dataset Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1171 entries, 0 to 1170
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Unnamed: 0        1171 non-null   int64 
 1   job_title         1171 non-null   object
 2   job_type          1171 non-null   