In [None]:
# Collapse rarely recommended languages into a single "Other" bucket so the
# column does not carry a long tail of near-empty categories.
language_col = 'What programming language would you recommend an aspiring data scientist to learn first?'

# Categories whose relative frequency falls below this share are grouped as "Other"
threshold = 0.01  # 1%
total_count = len(preprocessed_df)  # not used in this cell; kept in case other cells read it
value_counts = preprocessed_df[language_col].value_counts(normalize=True)

# Look up every row's category share in one vectorized pass. value_counts()
# excludes NaN, so a missing answer maps to NaN here instead of raising the
# KeyError that a per-row `value_counts[x]` lookup produces.
category_share = preprocessed_df[language_col].map(value_counts)

# Keep frequent categories and missing answers as they are; only genuinely
# rare categories are relabelled.
keep_mask = category_share.ge(threshold) | preprocessed_df[language_col].isna()
preprocessed_df[language_col] = preprocessed_df[language_col].where(keep_mask, 'Other')

# Check the new distribution
print(preprocessed_df[language_col].value_counts(normalize=True))

In [None]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA

# One-hot encode the recommended-language answer (after grouping rare
# categories) and project the indicator columns onto two principal components.
language_col = 'What programming language would you recommend an aspiring data scientist to learn first?'
pca_columns = ['PCA1', 'PCA2']

# Step 1: Group Low-Frequency Categories
threshold = 0.01  # 1% threshold
value_counts = preprocessed_df[language_col].value_counts(normalize=True)
# Vectorized lookup: missing answers map to NaN instead of raising KeyError,
# because value_counts() does not include NaN in its index.
category_share = preprocessed_df[language_col].map(value_counts)
keep_mask = category_share.ge(threshold) | preprocessed_df[language_col].isna()
preprocessed_df[language_col] = preprocessed_df[language_col].where(keep_mask, 'Other')

# Step 2: One-Hot Encode the Grouped Data
# The `sparse=` keyword was removed from OneHotEncoder in scikit-learn 1.4;
# taking the default output and densifying it works on every version.
encoder = OneHotEncoder()
encoded_data = encoder.fit_transform(preprocessed_df[[language_col]]).toarray()

# Wrap the encoded array in a DataFrame that shares the source frame's index,
# so later column-wise concatenation lines up row for row.
encoded_df = pd.DataFrame(
    encoded_data,
    columns=encoder.get_feature_names_out(),
    index=preprocessed_df.index,
)

# Step 3: Apply PCA for Dimensionality Reduction
# Adjust n_components (and pca_columns) based on the level of reduction desired
pca = PCA(n_components=len(pca_columns))
pca_data = pca.fit_transform(encoded_df)

# Attach the components to the original rows. Reusing the original index
# avoids the NaN/extra rows that a fresh RangeIndex produces when
# preprocessed_df has a non-default index. Dropping stale PCA columns first
# keeps the cell re-runnable without creating duplicate column names.
pca_df = pd.DataFrame(pca_data, columns=pca_columns, index=preprocessed_df.index)
preprocessed_df = pd.concat(
    [preprocessed_df.drop(columns=pca_columns, errors='ignore'), pca_df],
    axis=1,
)

# Check the result
preprocessed_df.head()

In [None]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA

# One-hot encode the recommended-language answer (missing values filled, rare
# categories grouped) and project the indicators onto two principal components.
language_col = 'What programming language would you recommend an aspiring data scientist to learn first?'
pca_columns = ['PCA1', 'PCA2']

# Step 1: Check for and handle missing values in the relevant column.
# Assign the result back: fillna(inplace=True) on a column selection acts on
# a temporary object and does not update the frame under Copy-on-Write.
preprocessed_df[language_col] = preprocessed_df[language_col].fillna('Unknown')

# Step 2: Group Low-Frequency Categories
threshold = 0.01  # 1% threshold
value_counts = preprocessed_df[language_col].value_counts(normalize=True)
# Vectorized replacement for the per-row lookup; rows whose category share is
# below the threshold become 'Other'.
category_share = preprocessed_df[language_col].map(value_counts)
preprocessed_df[language_col] = preprocessed_df[language_col].where(
    category_share.ge(threshold), 'Other'
)

# Step 3: One-Hot Encode the Grouped Data
# The `sparse=` keyword was removed from OneHotEncoder in scikit-learn 1.4;
# taking the default output and densifying it works on every version.
encoder = OneHotEncoder()
encoded_data = encoder.fit_transform(preprocessed_df[[language_col]]).toarray()

# Convert encoded data to DataFrame and ensure row alignment with the source frame
encoded_df = pd.DataFrame(
    encoded_data,
    columns=encoder.get_feature_names_out(),
    index=preprocessed_df.index,
)

# Step 4: Apply PCA for Dimensionality Reduction
pca = PCA(n_components=len(pca_columns))
pca_data = pca.fit_transform(encoded_df)

# Convert PCA results to DataFrame and align indices
pca_df = pd.DataFrame(pca_data, columns=pca_columns, index=preprocessed_df.index)

# Step 5: Concatenate PCA results with original DataFrame.
# Any PCA columns left by an earlier run are dropped first so the frame never
# ends up with duplicate 'PCA1'/'PCA2' column names.
preprocessed_df = pd.concat(
    [preprocessed_df.drop(columns=pca_columns, errors='ignore'), pca_df],
    axis=1,
)

# Check the result for any increase in row count or unexpected NaNs.
# A bare `preprocessed_df.shape` in the middle of a cell displays nothing, so print it.
print(preprocessed_df.shape)
print(preprocessed_df.isna().sum())

In [None]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA

# Replace the recommended-language answer with a numeric feature: the log of
# how often each answer occurs in the data.
language_col = 'What programming language would you recommend an aspiring data scientist to learn first?'

# Step 1: Count how many rows carry each category
value_counts = preprocessed_df[language_col].value_counts()

# Steps 1-2 combined: map every row to its category count and compress the
# counts with log1p so the dominant category does not swamp the rest.
preprocessed_df['language_freq'] = np.log1p(preprocessed_df[language_col].map(value_counts))

# Step 3: PCA is deliberately skipped here: there is only one derived feature
# (language_freq), so there is nothing to reduce. With several such columns
# they could be combined and passed through PCA at this point.

# Step 4: Remove the original categorical column now that it is encoded
preprocessed_df = preprocessed_df.drop(columns=[language_col])

# Check the new distribution
print(preprocessed_df['language_freq'].describe())
print(preprocessed_df.head())

In [None]:
# The column headers contain spaces and do not follow a consistent naming convention.
from dataprep.clean import clean_headers

# Normalize the column headers with dataprep's clean_headers, leaving accented
# characters untouched (remove_accents=False). No `case` argument is passed,
# so the library's default casing applies (presumably snake_case; confirm
# against the dataprep documentation).
# NOTE(review): cells further down still index preprocessed_df by the original
# question text; verify those lookups still resolve after this renaming.
preprocessed_df = clean_headers(preprocessed_df, remove_accents=False) 

# Inspect the resulting column names
print(preprocessed_df.columns)

In [None]:
# Feature Creation
import pandas as pd
import re

# Turn the multi-select "favorite media sources" answer into one yes/no
# column per option on the working frame `df`.
media_sources_col = 'Who/what are your favorite media sources that report on data science topics?'

# A parenthesised qualifier together with the whitespace in front of it
_paren_note = re.compile(r'\s*\([^)]*\)')


def _split_selections(raw):
    """Return the individual options contained in one multi-select answer.

    Parenthesised qualifiers are removed, the remainder is split on ", ",
    and each option is stripped of surrounding whitespace. Empty options are
    discarded. Non-string input (for example NaN for an unanswered question)
    yields an empty list.
    """
    if not isinstance(raw, str):
        return []
    without_notes = _paren_note.sub('', raw)
    return [part.strip() for part in without_notes.split(", ") if part.strip()]


# Copy the raw answers into the working frame
df["Selections"] = preprocessed_df[media_sources_col]

# Step 1: Remove text within parentheses and split options.
# The earlier inline re.sub call had its quotes misplaced, which fused the
# replacement and separator into the pattern and left re.sub without its
# string argument.
df["Selections_Clean"] = df["Selections"].apply(_split_selections)

# Step 2: Get unique options across all rows (options are already stripped)
unique_options = set(option for row in df["Selections_Clean"] for option in row)

# Step 3: Create a column for each unique option and mark as 'yes' or 'no'.
# Sorting gives a stable column order; binding `opt` as a default argument
# pins the option for each lambda instead of relying on the loop variable.
for option in sorted(unique_options):
    df[option] = df["Selections_Clean"].apply(
        lambda chosen, opt=option: "yes" if opt in chosen else "no"
    )

# Drop the intermediate cleaned column if not needed
df.drop(columns=["Selections_Clean"], inplace=True)

# Preview the result rather than dumping the whole frame
print(df.head())

Adding as features

In [None]:
# Turn the multi-select "favorite media sources" answer into one 0/1
# indicator column per source on preprocessed_df.
media_sources_col = 'Who/what are your favorite media sources that report on data science topics?'

# Step 1: Simplify the responses by removing parenthesised qualifiers.
# regex=True is required: since pandas 2.0 Series.str.replace treats the
# pattern as a literal string by default, which would leave the text unchanged.
df['Simplified_Sources'] = preprocessed_df[media_sources_col].str.replace(
    r"\s*\(.*?\)", "", regex=True
)

# Step 2: Split the sources by delimiter to get lists of sources
df['Source_List'] = df['Simplified_Sources'].str.split(', ')

# Keep a cleaned list-valued copy for the feature construction below.
# Unanswered rows (non-list values such as NaN) become empty lists.
source_lists = df['Source_List'].apply(
    lambda items: [name.strip() for name in items if name.strip()] if isinstance(items, list) else []
)

# Convert lists to strings for the stored column
df['Source_List'] = df['Source_List'].apply(lambda x: ', '.join(x) if isinstance(x, list) else x)

# Store the simplified text back on preprocessed_df
preprocessed_df[media_sources_col] = df['Simplified_Sources']

# Step 3: Get unique sources to create columns.
# This must iterate the list-valued series: iterating the joined strings
# yields single characters rather than source names.
unique_sources = set(source for sources in source_lists for source in sources)

# Step 4: Create binary columns for each unique source.
# Membership is tested against the list of sources, so one source name that
# happens to be a substring of another is not counted by mistake.
for source in sorted(unique_sources):
    preprocessed_df[source] = source_lists.apply(
        lambda items, name=source: 1 if name in items else 0
    )

In [None]:
# Expand the company-size answer into indicator columns whose names start
# with "Company_Size".
company_size_col = 'What is the size of the company where you are employed?'
company_size_dummies = pd.get_dummies(preprocessed_df[company_size_col], prefix="Company_Size")

# Place the indicator columns next to the existing ones in a new frame;
# preprocessed_df itself is not reassigned here.
df_test = pd.concat(objs=[preprocessed_df, company_size_dummies], axis="columns")

In [None]:
import numpy as np
import pandas as pd

# Function to split and clean min and max compensation
def split_compensation(df, column):
    """Split a compensation-range column into integer minimum and maximum columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the text column to parse. It is not modified.
    column : str
        Name of a string column holding values such as "$0-999",
        "100,000-124,999" or "> $500,000".

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with two added integer columns:
        'Minimum Compensation (USD)', the first amount found in the text, and
        'Maximum Compensation (USD)', the amount following a '-'.
        A value of 0 means the amount was absent: missing answers give 0 in
        both columns, and open-ended answers with no upper bound give 0 as
        the maximum. The original ``column`` is kept.
    """
    # Work on a copy so the caller's frame is left untouched
    df = df.copy()
    raw = df[column]

    def _extract_amount(pattern):
        # Capture the amount, drop thousands separators, and convert to int.
        # Rows without a match become 0.
        captured = raw.str.extract(pattern, expand=False)
        digits = captured.str.replace(',', '', regex=False)
        return pd.to_numeric(digits, errors='coerce').fillna(0).astype(int)

    # `\d[\d,]*` accepts any number of comma groups, so "1,000,000" is read in
    # full instead of being cut off after the first group.
    # Plain column assignment is used so the new columns take an integer
    # dtype; writing through `df.loc[:, col]` into an existing object column
    # can leave the column as object dtype.
    df['Minimum Compensation (USD)'] = _extract_amount(r'\$?(\d[\d,]*)')

    # The upper bound is the amount after the dash; optional whitespace and a
    # dollar sign between the dash and the digits are tolerated.
    df['Maximum Compensation (USD)'] = _extract_amount(r'-\s*\$?(\d[\d,]*)')

    return df

# Derive the minimum/maximum compensation columns; the helper returns a new
# frame, which replaces preprocessed_df.
compensation_col = 'What is your current yearly compensation (approximate $USD)?'
preprocessed_df = split_compensation(df=preprocessed_df, column=compensation_col)

# Show the frame including the added compensation columns
print(preprocessed_df)