# Task 1

In [6]:
# Import necessary libraries
import pandas as pd
import numpy as np

# Read the salaries dataset; point `dataset_path` at your own copy of ds_salaries.csv.
dataset_path = '/Users/ananyasmaranikadivendi/Downloads/ds_salaries.csv'
df = pd.read_csv(dataset_path)

# Function to create a profile table with column name, data type, unique values, min, and max
def create_profile_table(df):
    """Summarise each column of ``df`` as one row of a profile table.

    Args:
        df: The DataFrame to profile.

    Returns:
        A DataFrame with one row per column of ``df`` and the columns
        'Column Name', 'Data Type', 'Unique Values', 'Min' and 'Max'.
        'Min' and 'Max' are filled only for numeric columns; rows that
        describe non-numeric columns hold NaN there.
    """
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    numeric_part = df[numeric_cols]

    summary = pd.DataFrame({
        'Column Name': df.columns,
        'Data Type': df.dtypes.values,
        'Unique Values': df.nunique().values,
        # Placeholders: only the rows describing numeric columns are overwritten.
        'Min': np.nan,
        'Max': np.nan,
    })

    # Both the mask and the min/max results follow the column order of ``df``,
    # so the positional assignment below lines up row for row.
    is_numeric_row = summary['Column Name'].isin(numeric_cols)
    summary.loc[is_numeric_row, 'Min'] = numeric_part.min().values
    summary.loc[is_numeric_row, 'Max'] = numeric_part.max().values

    return summary

# Build the per-column profile of the loaded dataset
profile_table = create_profile_table(df)

# Show the profile in the notebook output
print(profile_table)

# Persist the profile next to the notebook (row index omitted)
profile_table.to_csv(path_or_buf='profile_table.csv', index=False)


           Column Name Data Type  Unique Values     Min         Max
0            work_year     int64              4  2020.0      2023.0
1     experience_level    object              4     NaN         NaN
2      employment_type    object              4     NaN         NaN
3            job_title    object             93     NaN         NaN
4               salary     int64            815  6000.0  30400000.0
5      salary_currency    object             20     NaN         NaN
6        salary_in_usd     int64           1035  5132.0    450000.0
7   employee_residence    object             78     NaN         NaN
8         remote_ratio     int64              3     0.0       100.0
9     company_location    object             72     NaN         NaN
10        company_size    object              3     NaN         NaN


# Task 3

# Code to handle outliers

In [13]:
# Import necessary libraries
import pandas as pd
import numpy as np
from scipy import stats

# Read the salaries dataset; point `dataset_path` at your own copy of ds_salaries.csv.
dataset_path = '/Users/ananyasmaranikadivendi/Downloads/ds_salaries.csv'
df = pd.read_csv(dataset_path)

# Function to detect and handle outliers
def _outlier_bounds(series, method):
    """Return the (lower, upper) limits beyond which values of ``series`` are outliers.

    Missing values are ignored when computing the limits (pandas skips NaN in
    ``quantile``, ``mean`` and ``std``).
    """
    if method == 'IQR':
        q1 = series.quantile(0.25)
        q3 = series.quantile(0.75)
        iqr = q3 - q1
        return q1 - 1.5 * iqr, q3 + 1.5 * iqr

    # 'Z-score': |z| > 3 is the same as lying outside mean +/- 3 * std.
    # Population std (ddof=0) matches the default of scipy.stats.zscore.
    center = series.mean()
    spread = series.std(ddof=0)
    return center - 3 * spread, center + 3 * spread


def handle_outliers(df, method='IQR', remove=True):
    """Detect outliers in the numeric columns of ``df`` and remove or cap them.

    Args:
        df: Input DataFrame. It is not modified; a copy is processed.
        method: 'IQR' flags values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR];
            'Z-score' flags values more than 3 standard deviations from
            the column mean.
        remove: If True, drop every row that holds an outlier in any numeric
            column. If False, keep all rows and clamp outlying values to the
            nearest limit.

    Returns:
        A new DataFrame with outliers removed or capped.

    Raises:
        ValueError: If ``method`` is neither 'IQR' nor 'Z-score'.
    """
    if method not in ('IQR', 'Z-score'):
        # Fail loudly: returning the data untouched would hide a typo.
        raise ValueError(
            f"Unknown outlier method {method!r}; expected 'IQR' or 'Z-score'"
        )

    df_cleaned = df.copy()
    numeric_cols = df_cleaned.select_dtypes(include=[np.number]).columns

    # Positional mask of rows to keep; combined across all numeric columns so
    # that the limits of every column come from the same, unfiltered data and
    # the result does not depend on column order.
    keep = np.ones(len(df_cleaned), dtype=bool)

    for column in numeric_cols:
        values = df_cleaned[column]
        lower_bound, upper_bound = _outlier_bounds(values, method)

        if remove:
            # A missing value is not an outlier, so NaN rows are retained.
            within = values.between(lower_bound, upper_bound) | values.isna()
            keep &= within.to_numpy()
        else:
            # Clamp to the limits; NaN entries pass through unchanged.
            df_cleaned[column] = values.clip(lower=lower_bound, upper=upper_bound)

    if remove:
        df_cleaned = df_cleaned[keep]

    return df_cleaned

# Drop every row flagged as an outlier by the IQR rule
df_cleaned = handle_outliers(df, remove=True, method='IQR')

# Show the cleaned data in the notebook output
print(df_cleaned)

# Persist the cleaned dataset next to the notebook (row index omitted)
df_cleaned.to_csv(path_or_buf='DataAnalyst_cleaned.csv', index=False)


      work_year experience_level employment_type                 job_title  \
0          2023               SE              FT  Principal Data Scientist   
1          2023               MI              CT               ML Engineer   
2          2023               MI              CT               ML Engineer   
3          2023               SE              FT            Data Scientist   
4          2023               SE              FT            Data Scientist   
...         ...              ...             ...                       ...   
3745       2021               SE              FT  Director of Data Science   
3746       2021               MI              FT            Data Scientist   
3748       2021               MI              FT             Data Engineer   
3749       2021               SE              FT           Data Specialist   
3751       2021               MI              FT  Principal Data Scientist   

      salary salary_currency  salary_in_usd employee_residence 