# KULLANILABILIR FONKSIYONLAR

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
import cufflinks as cf
%matplotlib inline 

from scipy import stats
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate, StratifiedKFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.preprocessing import PowerTransformer, OneHotEncoder, LabelEncoder 
from sklearn.pipeline import Pipeline


from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score, accuracy_score
from sklearn.metrics import PrecisionRecallDisplay, roc_curve, average_precision_score, precision_recall_curve
from sklearn.metrics import RocCurveDisplay, roc_auc_score, auc
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay


from yellowbrick.regressor import ResidualsPlot, PredictionError

import warnings
warnings.filterwarnings("ignore")

## User Defined Funcs for Summary of Data

In [None]:
# =============== User-Defined-Function ==========================

def rename_columns_by_position(df, new_column_names):
    """
    Rename the columns of a DataFrame by position.

    The i-th entry of ``new_column_names`` becomes the name of the i-th
    column. The DataFrame is modified in place and is also returned.

    Parameters:
        df (pd.DataFrame): DataFrame whose columns should be renamed.
        new_column_names (list): Replacement names, one per existing column.

    Returns:
        pd.DataFrame: The same DataFrame object, now carrying the new names.

    Raises:
        ValueError: If the number of names differs from the number of columns.
    """
    received = len(new_column_names)
    expected = len(df.columns)

    # Refuse to rename when the two lengths disagree
    if received != expected:
        raise ValueError("Sütun sayısı ile yeni adların sayısı eşleşmiyor.")

    # Positional assignment overwrites the column index of the caller's frame
    df.columns = new_column_names
    return df


#================ Categorical Features Summary ===================
def object_summary(df):
    """
    Build a per-column summary of the object-dtype columns of ``df``.

    The summary has one row per object column and the columns Dtype, Counts,
    Nulls, NullPercent, Top (first mode), Frequency (highest value count),
    Uniques and UniqueValues (a comma-separated preview, truncated to the
    first 10 unique values followed by '...' when there are more than 10).

    The overall shape of ``df`` and its number of duplicated rows are printed.

    Returns:
        pd.DataFrame: The summary table indexed by column name.
    """
    n_rows = df.shape[0]
    n_duplicates = df.duplicated().sum()

    # Restrict the analysis to object (typically string) columns
    text_cols = df.select_dtypes(include='object')

    def _top_value(col):
        # Most frequent value, or '-' when the column has no mode
        modes = col.mode()
        return '-' if modes.empty else modes.iloc[0]

    def _top_frequency(col):
        # Occurrence count of the most frequent value, or '-' when there is none
        counts = col.value_counts()
        return '-' if counts.empty else counts.max()

    def _unique_preview(col):
        # Comma-separated unique values; only the first 10 when there are more
        values = col.unique()
        if col.nunique() > 10:
            return ', '.join(str(v) for v in values[:10]) + '...'
        return ', '.join(str(v) for v in values)

    null_counts = text_cols.isnull().sum()

    summary = pd.DataFrame(index=text_cols.columns)
    summary['Dtype'] = text_cols.dtypes
    summary['Counts'] = text_cols.count()
    summary['Nulls'] = null_counts
    summary['NullPercent'] = (null_counts / n_rows) * 100
    summary['Top'] = text_cols.apply(_top_value)
    summary['Frequency'] = text_cols.apply(_top_frequency)
    summary['Uniques'] = text_cols.nunique()
    summary['UniqueValues'] = text_cols.apply(_unique_preview)

    # Report the overall shape and the number of duplicated rows
    print(f'1. Data shape (rows, columns): {df.shape}')
    print(f'2. Number of duplicate rows: {n_duplicates}')
    return summary


#================ Numerical Features Summary ===================
def numeric_summary(df):
    """
    Build a per-column summary of the float64/int64 columns of ``df``.

    The summary has one row per selected column and the columns Dtype,
    Counts, Nulls, NullPercent, Mean, Std, Min, 25%, 50% (Median), 75% and
    Max. Only columns whose dtype is exactly float64 or int64 are included.

    The overall shape of ``df`` and its number of duplicated rows are printed.

    Returns:
        pd.DataFrame: The summary table indexed by column name.
    """
    n_rows = df.shape[0]
    n_duplicates = df.duplicated().sum()

    # Keep only the 64-bit float and integer columns
    numbers = df.select_dtypes(include=['float64', 'int64'])
    null_counts = numbers.isnull().sum()

    # (label, per-column statistic) pairs, in output column order
    statistics = [
        ('Dtype', numbers.dtypes),
        ('Counts', numbers.count()),
        ('Nulls', null_counts),
        ('NullPercent', (null_counts / n_rows) * 100),
        ('Mean', numbers.mean()),
        ('Std', numbers.std()),
        ('Min', numbers.min()),
        ('25%', numbers.quantile(0.25)),
        ('50% (Median)', numbers.median()),
        ('75%', numbers.quantile(0.75)),
        ('Max', numbers.max()),
    ]

    summary = pd.DataFrame(index=numbers.columns)
    for label, values in statistics:
        summary[label] = values

    # Report the overall shape and the number of duplicated rows
    print(f'1. Data shape (rows, columns): {df.shape}')
    print(f'2. Number of duplicate rows: {n_duplicates}')
    return summary
    

#========== Get count and percentage of values for each column =================
def get_value_count(df, column_name):
    """
    Tabulate how often each value of one column occurs.

    Returns a DataFrame with three columns: the distinct values (named after
    ``column_name``), their absolute ``counts`` and their share of the
    non-null rows as a ``percent`` string formatted like '12.34%'.
    """
    column = df[column_name]

    # Absolute frequencies as a two-column frame
    absolute = column.value_counts().rename_axis(column_name).reset_index(name='counts')

    # Relative frequencies, rendered as percentage strings
    relative = column.value_counts(normalize=True).rename_axis(column_name).reset_index(name='percent')
    relative['percent'] = (relative['percent'] * 100).map('{:.2f}%'.format)

    return pd.concat(
        [absolute[column_name], absolute['counts'], relative['percent']],
        axis=1,
    )


#============== Checks duplicates and drops them ==========================

def duplicate_values(df):
    """
    Report duplicated rows of ``df`` and drop them in place.

    The first occurrence of each duplicated row is kept. Progress messages
    are printed; nothing is returned.
    """
    print("Duplicate check...")
    num_duplicates = df.duplicated(subset=None, keep='first').sum()

    # Nothing to remove: report and leave the frame untouched
    if num_duplicates <= 0:
        print("There are no duplicated observations in the dataset.")
        return

    print("There are", num_duplicates, "duplicated observations in the dataset.")
    # Mutates the caller's DataFrame
    df.drop_duplicates(keep='first', inplace=True)
    print(num_duplicates, "duplicates were dropped!")
    print("No more duplicate rows!")



# ========== User-Defined-Function for Missing Values ============
def missing_values(df):
    """
    Summarize missing values per column.

    Returns a DataFrame indexed by column name with the number of nulls
    (``count``) and the null share of all rows as a string such as '12.50%'
    (``percentage``).
    """
    null_mask = df.isnull()

    nulls_per_column = null_mask.sum()
    # count() on the boolean mask yields the total number of rows per column
    rows_per_column = null_mask.count()

    share = (nulls_per_column / rows_per_column * 100).round(2)

    return pd.DataFrame({
        "count": nulls_per_column,
        "percentage": share.map("{:.2f}%".format),
    })


# ========== Plotting Missing Values  ===========================
def na_ratio_plot(df):
    """
    Plot the missing/non-missing split of every feature and print null counts.

    The boolean null mask is melted to long form (one row per cell) and drawn
    with ``sns.displot`` using ``multiple='fill'``. Afterwards the null counts
    of the columns that contain at least one null are printed.
    """
    na_mask = df.isna()

    # Long form: 'Features' holds the column name, 'Missing_data' the null flag
    long_form = na_mask.melt(value_name='Missing_data', var_name='Features')
    sns.displot(long_form, y='Features', hue='Missing_data', multiple='fill', aspect=9/8)

    # Only list the columns that actually have missing values
    na_counts = na_mask.sum()
    print(na_counts[na_counts > 0])

    
    
#========== Detecting Anomalies ================================

def detect_anomalies(df, column_name):
    """
    Find string values of a column that are not purely alphanumeric.

    Only the unique values that are ``str`` instances are inspected; a value
    is flagged when ``str.isalnum()`` is False for it.

    Returns:
        str: The flagged values joined by ', ' (empty string when none).
    """
    flagged = []
    for value in df[column_name].unique():
        # Non-string entries (numbers, NaN, ...) are ignored
        if not isinstance(value, str):
            continue
        if value.isalnum():
            continue
        flagged.append(value)

    return ', '.join(flagged)


#========== Detecting Non-Numerical Characters ===========================

import re

def find_non_numeric_values(df, column_name):
    """
    Find the unique runs of non-digit characters in a column.

    Every value is converted to ``str`` and scanned on its own, so a match
    can never span two different cells and no artificial separator is
    reported as a finding.

    Parameters:
        df (pd.DataFrame): The input DataFrame.
        column_name (str): Name of the column to inspect.

    Returns:
        set: Distinct substrings made only of non-digit characters. Missing
        values are stringified first, so NaN shows up as 'nan'.
    """
    non_digit_run = re.compile(r'\D+')  # one or more consecutive non-digits

    found = set()
    for text in df[column_name].astype(str):
        found.update(non_digit_run.findall(text))
    return found


#========================================================================
#======================================================================

In [None]:
# =============== User-Defined-Function ==========================

#======= Get count and percentage of values for each column ======
def get_value_count(df, column_name):
    """
    Return value counts and percentage shares for one column of ``df``.

    The result has the distinct values in a column named ``column_name``,
    their frequencies in ``counts`` and their share of the non-null rows in
    ``percent`` as text with two decimals and a trailing '%'.
    """
    def _tabulate(normalize, label):
        # value_counts() turned into a two-column frame: value, <label>
        tallied = df[column_name].value_counts(normalize=normalize)
        return tallied.rename_axis(column_name).reset_index(name=label)

    counts_table = _tabulate(False, 'counts')
    percent_table = _tabulate(True, 'percent')

    # Fractions -> formatted percentage strings
    percent_table['percent'] = (percent_table['percent'] * 100).map('{:.2f}%'.format)

    pieces = [counts_table[column_name], counts_table['counts'], percent_table['percent']]
    return pd.concat(pieces, axis=1)



# ========== User-Defined-Function for Missing Values ============
def missing_values(df):
    """
    Count missing values per column and express them as a percentage.

    Returns a DataFrame indexed by column name with ``count`` (number of
    nulls) and ``percentage`` (nulls relative to all rows, formatted as a
    string like '0.00%').
    """
    is_missing = df.isnull()
    n_missing = is_missing.sum()

    # Total rows per column (count() of the mask ignores nothing, as it has no nulls)
    n_total = is_missing.count()

    percent = round(n_missing / n_total * 100, 2)
    percent_text = percent.map("{:.2f}%".format)

    result = pd.DataFrame({"count": n_missing, "percentage": percent_text})
    return result


# ============= Compare Missing Values (Train-Test) =============
def compare_missing_values(train, test):
    """
    Put the missing-value statistics of two DataFrames side by side.

    For each frame, only the columns that contain at least one null are
    listed, with their null count, null percentage (string like '12.50%')
    and dtype. The two tables are concatenated column-wise, so a column that
    has nulls in only one frame gets NaN in the other frame's columns.

    Returns:
        pd.DataFrame: Columns '<Label> Missing Values',
        '<Label> Missing Percentage' and '<Label> dtypes' for the labels
        'Train' and 'Test'.
    """
    def _summarize(frame, label):
        null_counts = frame.isna().sum()
        # Keep only the columns that actually have missing entries
        affected = null_counts[null_counts > 0]
        n_rows = len(frame)
        percent_text = (affected / n_rows * 100).map("{:.2f}%".format)
        return pd.DataFrame({
            f'{label} Missing Values': affected,
            f'{label} Missing Percentage': percent_text,
            f'{label} dtypes': frame.dtypes[affected.index]
        })

    tables = [_summarize(train, 'Train'), _summarize(test, 'Test')]

    # Align on column name and place the two summaries next to each other
    return pd.concat(tables, axis=1)


# ========== Plotting Missing Values  ===========================
def na_ratio_plot(df):
    """
    Draw, per feature, the split between missing and present cells.

    Uses ``sns.displot`` on the melted null mask with ``multiple='fill'`` and
    then prints the null count of every column that has at least one null.
    """
    missing_flags = df.isna()
    melted = missing_flags.melt(value_name='Missing_data', var_name='Features')

    sns.displot(
        melted,
        y='Features',
        hue='Missing_data',
        multiple='fill',
        aspect=9/8,
    )

    # Print only the columns with a non-zero number of nulls
    totals = missing_flags.sum()
    has_missing = totals > 0
    print(totals[has_missing])

    
    
#========== Detecting Anomalies ================================

def detect_anomalies(df, column_name):
    """
    Detect string values containing non-alphanumeric characters.

    Looks at the unique values of the column, skips everything that is not a
    ``str`` and keeps the strings for which ``str.isalnum()`` is False.

    Returns:
        str: The detected values as one comma-separated string.
    """
    def _is_unusual(candidate):
        # Only strings qualify; alphanumeric-only strings are considered normal
        return isinstance(candidate, str) and not candidate.isalnum()

    distinct = df[column_name].unique()
    return ', '.join(filter(_is_unusual, distinct))


#========== Detecting Non-Numerical Characters ===========================

import re

def find_non_numeric_values(df, column_name):
    """
    Collect the unique non-digit fragments found in a column.

    Each cell is stringified and searched individually. Searching one joined
    string instead would report the join separator itself and could merge
    the tail of one cell with the head of the next into a single fragment.

    Parameters:
        df (pd.DataFrame): The input DataFrame.
        column_name (str): Name of the column to inspect.

    Returns:
        set: Distinct runs of non-digit characters. Missing values are
        stringified first, so NaN contributes 'nan'.
    """
    pattern = re.compile(r'\D+')  # Pattern to match runs of non-numeric characters
    return {
        fragment
        for cell in df[column_name].astype(str)
        for fragment in pattern.findall(cell)
    }


#=============== Clean_Unusual_Characters ===========================

import re
import numpy as np

def clean_and_convert_numeric(df, column_name):
    """
    Strip non-numeric characters from a column and convert it to float.

    Every value is stringified and all characters other than digits and '.'
    are removed. Values that end up empty become NaN. Because the minus sign
    is removed as well, the result is always non-negative; ``abs()`` is kept
    as an explicit safeguard.

    The column is replaced in place on ``df`` only after the conversion has
    succeeded.

    Parameters:
        df (DataFrame): The input DataFrame.
        column_name (str): The name of the column to clean and convert.

    Returns:
        DataFrame: The same DataFrame with the cleaned, float-typed column.

    Raises:
        ValueError: If a cleaned value is not a valid float (e.g. '1.2.3').
    """
    # 1. Remove everything except digits and the decimal point
    cleaned = df[column_name].apply(lambda x: re.sub(r'[^0-9.]', '', str(x)))

    # 2. Replace empty strings with NaN. The result is assigned back instead
    #    of using a chained ``inplace=True`` call, which does not update the
    #    DataFrame under pandas Copy-on-Write.
    cleaned = cleaned.replace('', np.nan)

    # 3. Convert to float and 4. take absolute values
    df[column_name] = cleaned.astype(float).abs()

    return df


# ============== User-Defined-Function ======================

from sklearn.impute import KNNImputer

def knn_impute_column(df, column, n_neighbors=5):
    """
    Fill the missing values of one column with ``KNNImputer``.

    Args:
        df (DataFrame): Frame to modify; the column is overwritten in place.
        column (str): Name of the column to impute.
        n_neighbors (int): Number of neighbors handed to the imputer
            (default 5).

    Returns:
        DataFrame: The same DataFrame with the imputed column.
    """
    # NOTE(review): only the target column is passed to the imputer, so no
    # other features are available for computing neighbor distances --
    # confirm this is the intended behavior.
    target = [column]
    knn = KNNImputer(n_neighbors=n_neighbors)
    df[target] = knn.fit_transform(df[target])
    return df
# ============================================================


## Duplicates

In [None]:
# Checks duplicates and drops them

def duplicate_values(df):
    """
    Check ``df`` for duplicated rows, print the outcome and drop them.

    Duplicates are removed in place, keeping the first occurrence of each
    row. The function returns nothing.
    """
    print("Duplicate check...")

    # Rows that repeat an earlier row are flagged True
    duplicated_mask = df.duplicated(subset=None, keep='first')
    num_duplicates = duplicated_mask.sum()

    has_duplicates = num_duplicates > 0
    if has_duplicates:
        print("There are", num_duplicates, "duplicated observations in the dataset.")
        df.drop_duplicates(keep='first', inplace=True)
        print(num_duplicates, "duplicates were dropped!")
        print("No more duplicate rows!")
    else:
        print("There are no duplicated observations in the dataset.")

# Run the duplicate check on df_otoklav_18 (defined outside this cell);
# duplicate_values drops any duplicated rows from it in place.
duplicate_values(df_otoklav_18)

In [None]:
# Let's observe first the unique values

def get_unique_values(df):
    """
    List, for every column, how many distinct values it has and its dtype.

    For columns with at most 10 distinct (non-null) values the array of
    unique values is included; for all other columns the placeholder "-" is
    used instead.

    Returns:
        pd.DataFrame: One row per column with the columns 'Column Name',
        'Number of Unique Values', ' Unique Values ' and 'Data Type'.
    """
    rows = []

    for name in df.columns:
        series = df.loc[:, name]
        n_unique = series.nunique()

        # Show the actual values only for low-cardinality columns (<= 10)
        shown = series.unique() if n_unique <= 10 else "-"

        rows.append([name, n_unique, shown, series.dtype])

    return pd.DataFrame(
        rows,
        columns=['Column Name', 'Number of Unique Values', ' Unique Values ', 'Data Type'],
    )

## Missing

In [None]:
def missing_values(df):
    """
    Count missing values per column and compute their percentage.

    Returns a DataFrame indexed by column name with ``count`` (number of
    nulls) and ``percentage`` (nulls relative to all rows, as a float rounded
    to two decimals).
    """
    null_mask = df.isnull()
    nulls = null_mask.sum()

    # count() on the mask gives the total number of rows for each column
    totals = null_mask.count()

    percent = (nulls / totals * 100).round(2)
    return pd.DataFrame({"count": nulls, "percentage": percent})

# Display the missing-value summary for df (defined outside this cell)
missing_values(df)

In [None]:
# Function for counting and normalizing values in the column

def value_cnt_fonc(df, column_name):
    """
    Return absolute and normalized value counts for one column.

    The result has three columns: the distinct values (named after
    ``column_name``), ``counts`` and ``norm_counts`` (fraction of the
    non-null rows, as a float).
    """
    column = df[column_name]

    # Absolute frequencies
    absolute = column.value_counts().rename_axis(column_name).reset_index(name='counts')

    # Relative frequencies (fractions that sum to 1)
    relative = column.value_counts(normalize=True).rename_axis(column_name).reset_index(name='norm_counts')

    return pd.concat(
        [absolute[column_name], absolute['counts'], relative['norm_counts']],
        axis=1,
    )

## Distributions

In [None]:
# DISTRIBUTIONS OF CATEGORICAL FEATURES;

# One count plot per categorical feature, each in its own figure
for column in cat_features:
    plt.figure(figsize=(8, 6))
    ax = sns.countplot(x=column, data=df, palette='BuPu')
    plt.title(f'Distribution of Categories {column}')

    # Label every bar container, not only the first one: depending on the
    # seaborn version the bars may be split across several containers.
    for container in ax.containers:
        ax.bar_label(container)

    plt.xticks(rotation=90)
    plt.show()

In [None]:
# CATEGORICAL FEATURES BY TARGET (income <=50k | income >50k)

# One count plot per categorical feature, with bars split by the 'income' column
for i in cat_features:
    fig, ax = plt.subplots(figsize=(20, 5))
    sns.countplot(ax=ax, data=df, x=i, hue="income", palette='BuPu')
    ax.set(ylabel='Counts', title=i)

    # Label the bars of every hue level that is actually present, instead of
    # assuming exactly two containers.
    for container in ax.containers:
        ax.bar_label(container)

    plt.xticks(rotation=45)

plt.show()

In [None]:
# Selecting categorical data for univariate analysis:

cats = ['Survived', 'Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked']

def plotFrequency(cats):
    """
    Draw a count plot for each categorical column, annotated with percentages.

    The plots are laid out on a fixed 3x2 grid, so at most six columns are
    drawn. The data is read from the notebook-level frames ``all_data`` and
    ``train``, which must exist before calling.

    Parameters:
        cats (list): Names of the columns of ``all_data`` to plot.
    """
    fig, axes = plt.subplots(3, 2, figsize=(20,25))
    axes = axes.flatten()

    for ax, cat in zip(axes, cats):
        # 'Survived' percentages are taken relative to the length of `train`,
        # every other column relative to the length of `all_data`.
        if cat == 'Survived':
            total = float(len(train[cat]))
        else:
            total = float(len(all_data[cat]))

        # Pass the series as `x` explicitly: a positional argument is bound
        # to `data` rather than `x` by newer seaborn versions.
        sns.countplot(x=all_data[cat], palette='plasma', ax=ax)

        # Write each bar's share of the total slightly above the bar
        for p in ax.patches:
            height = p.get_height()
            ax.text(p.get_x() + p.get_width() / 2.,
                    height + 10,
                    '{:1.2f}%'.format((height / total) * 100),
                    ha="center")

In [None]:
# DISTRIBUTIONS OF NUMERICAL FEATURES;

# Histogram (with KDE curve) of every numeric column, three plots per row
numerical_df = df.select_dtypes(include=['number'])

plt.figure(figsize=(20,15))

num_vars = len(numerical_df.columns)
# Ceiling division: exactly as many rows of three as are needed, so no empty
# extra row is reserved when the number of columns is a multiple of three.
num_rows = -(-num_vars // 3)

for i, var in enumerate(numerical_df.columns, 1):
    plt.subplot(num_rows, 3, i)
    sns.histplot(data=df, x=var, kde=True)
    plt.title(f'Distribution of {var}')
    
plt.tight_layout()
plt.show()

In [None]:
# KDE plot of every column except the last one, two plots per row
fig = plt.figure(figsize=(10,14), dpi=200)
kde_columns = df.columns[:-1]
# Keep the original 8-row layout, but add rows when there are more than 16
# columns so that plt.subplot never receives an out-of-range position.
kde_rows = max(8, -(-len(kde_columns) // 2))
for i, col in enumerate(kde_columns):
        plt.subplot(kde_rows,2,i+1)
        sns.kdeplot(df[col])
plt.tight_layout();

## Outlier Analysis

In [None]:
# 1) Checking Outliers on Numerical Features by the Target // whis=3

# Box plot of every numeric feature grouped by 'income', three plots per row
numeric_features = [
    feature
    for feature in df.select_dtypes(include=['number']).columns
    if feature != "income"
]
# Keep the original 3-row layout, but add rows when there are more than nine
# features so that plt.subplot never receives an out-of-range position.
grid_rows = max(3, -(-len(numeric_features) // 3))

index = 0
plt.figure(figsize=(20,15))
for index, feature in enumerate(numeric_features, 1):
    plt.subplot(grid_rows,3,index)
    # whis=3 widens the whiskers to 3 * IQR
    sns.boxplot(x='income',y=feature,data=df, whis=3, palette='BuPu') 
plt.show()

In [None]:
# 2) BOXPLOT the OUTLIERS

# Initialize the subplot counter
# Numeric columns to plot, one box plot each, all in a single row
numeric_cols = df.select_dtypes(include=['number']).columns

# Keep the original 8-slot row, but widen the grid when there are more than
# eight numeric columns so that plt.subplot never gets an out-of-range position.
grid_cols = max(8, len(numeric_cols))

# Initialize the subplot counter
x = 0

# Create a figure with specified size
plt.figure(figsize=(16, 4))

# Loop through each numerical column and create a boxplot
for x, col in enumerate(numeric_cols, 1):
    plt.subplot(1, grid_cols, x)
    sns.boxplot(data=df[col])
    plt.title(col)

# Show the plots
plt.tight_layout()  # Adjust subplots to fit in the figure area.
plt.show()


In [None]:
# 3) Function: BOXPLOT the OUTLIERS (a separate plot for each column, showing the values within that column)

def plot_feature_outliers(df, hue_column):
    """
    Draw one box plot per feature, grouped by the values of ``hue_column``.

    Every column except ``hue_column`` is plotted, two plots per row. The
    grid has at least nine rows and grows when more are needed.

    Parameters:
        df (pd.DataFrame): Data to plot.
        hue_column (str): Column used for the x-axis grouping.
    """
    # Plot every column other than the grouping column, wherever it sits
    feature_columns = [col for col in df.columns if col != hue_column]
    n_rows = max(9, -(-len(feature_columns) // 2))

    plt.figure(figsize=(20,30))
    for i, col in enumerate(feature_columns, 1):
        plt.subplot(n_rows, 2, i)
        plt.title(f"Distribution of {col} Data with Outliers")
        sns.boxplot(x=hue_column, y=col, data=df)

    # Adjust the layout once, after all subplots exist
    plt.tight_layout()
    plt.show()
    
plot_feature_outliers(df, "class")

In [None]:
# Box plot for each column by target (with text box), one figure per feature

import plotly.express as px

# Every column except the last one is treated as a feature
features = df.columns[:-1]
for i in features:
    # Feature values on the x-axis, the 'class' column on the y-axis
    fig = px.box(df, x=i, y= 'class')
    fig.show()

In [None]:
# Shows box plots of the columns together in a single chart (with text box)
import cufflinks as cf  
cf.go_offline()
# All columns from position 1 onward are plotted; the first column is left out
df.iloc[:,1:].iplot(kind="box")

In [None]:
# Box plots: one panel per measurement, grouped by the "labels" column.
# The panels share a single row of four slots.

plt.figure(figsize = (20,6))
for panel, measurement in enumerate(
    ("sepal_length", "sepal_width", "petal_length", "petal_width"), start=1
):
    plt.subplot(1, 4, panel)
    sns.boxplot(y = measurement, x = "labels", data = X, palette="BuPu")

In [None]:
# Box plot of every numeric column on a grid with three plots per row
numeric_columns = df.select_dtypes(include=['number'])

ncols = 3
num_plots = len(numeric_columns.columns)
nrows = (num_plots - 1) // ncols + 1  # ceiling division

# squeeze=False keeps `axes` two-dimensional even when there is a single row;
# otherwise axes[row, col] fails for three or fewer numeric columns.
fig, axes = plt.subplots(nrows, ncols, figsize=(15, 5 * nrows), squeeze=False)

for i, column in enumerate(numeric_columns.columns):
    row = i // ncols
    col = i % ncols
    sns.boxplot(data=numeric_columns, y=column, ax=axes[row, col])
    axes[row, col].set_title(f'Boxplot of {column}')

# Remove the unused axes in the last row
for i in range(num_plots, nrows * ncols):
    fig.delaxes(axes.flatten()[i])

plt.tight_layout()
plt.show()

In [None]:
# outlier deletion
# For every numeric column and every Credit_Score group, drop the rows whose
# value lies more than four standard deviations away from the group mean.
df_num = df.select_dtypes(include='number')
for column in df_num.columns:
    for i in df["Credit_Score"].unique():
        # Re-select from the current df: rows dropped earlier are already gone
        selected_i = df[df["Credit_Score"] == i]
        selected_column = selected_i[column]
        
        std = selected_column.std()
        mean= selected_column.mean()
        
        # Named bounds instead of `max`/`min`, which would shadow the
        # builtins for the rest of the notebook session.
        upper_bound = mean + (4 * std)
        lower_bound = mean - (4 * std)
        
        outliers = selected_column[
            (selected_column > upper_bound) | (selected_column < lower_bound)
        ].index
        df.drop(index=outliers, inplace=True)
        print(column, i, outliers)

### Cleaning Outliers

In [None]:
#Cleaning Outliers 

from scipy import stats

# Calculate Z-Score for each numerical column
z_scores = np.abs(stats.zscore(df.select_dtypes(include=['number'])))

# Define a threshold (commonly 3)
threshold = 3

# Identify outliers: rows where at least one column exceeds the threshold
outliers = (z_scores > threshold).any(axis=1)

# Filter out the outliers
df_cleaned = df[~outliers]

# Update the all Dataset if needed
#df = df_cleaned.copy() 

# Display the shape of the dataframe before and after outlier removal
print("Original dataframe shape:", df.shape)
print("Dataframe shape after outlier removal:", df_cleaned.shape)
 

# Display the first few rows of the cleaned dataframe
print(df_cleaned.head())

## Skewness

In [None]:
# Calculate skewness for numeric features

# A skewness value greater than 1 indicates positive skewness,
# a skewness value less than -1 indicates negative skewness,
# and a skewness value close to zero indicates a relatively symmetric distribution.

# Numeric columns whose skewness is computed
num_cols = df.select_dtypes('number').columns

# Columns whose absolute skewness exceeds this limit are reported
skew_limit = 0.75
skew_vals = df[num_cols].skew()

# One-column frame named 'Skew', sorted from most positive to most negative
skew_frame = skew_vals.sort_values(ascending=False).to_frame(name='Skew')

# Keep only the columns skewed beyond the limit, in either direction
skew_cols = skew_frame[skew_frame['Skew'].abs() > skew_limit]
skew_cols

###  Log Transform

In [None]:
# Apply log transformation to skewed columns

for col in skew_cols.index:
    # log1p(x) = log(1 + x) is only defined for x > -1. Columns containing
    # smaller values would silently turn into NaN / -inf, so they are
    # skipped and reported instead.
    if df[col].min() <= -1:
        print(f"Skipping log transform for {col}: contains values <= -1")
        continue
    df[col] = np.log1p(df[col])

# Display the transformed dataframe
print(df.head())

## ANOVA Test for Feature Selection

In [None]:
# Perform ANOVA test for each categorical feature
# One-way ANOVA of co2_emissions across the categories of each object column
anova_results = {}
categorical_features = df.select_dtypes(include=['object']).columns

for feature in categorical_features:
    # groupby splits the target once per feature and leaves out rows whose
    # category is missing, so no empty group is handed to f_oneway.
    groups = [
        values.values
        for _, values in df.groupby(feature)["co2_emissions"]
    ]
    # ANOVA needs at least two groups to compare
    if len(groups) < 2:
        continue
    anova_results[feature] = stats.f_oneway(*groups)

# Display the ANOVA results
for feature, result in anova_results.items():
    print(f"ANOVA result for {feature}:")
    print(f"F-statistic: {result.statistic}, p-value: {result.pvalue}")
    print()

## Machine Learning

In [None]:
# User-Defined-Functions
#####################################################################################

# Function to Evaluate the Model Performans using Classification Confusion_matrix() 
# Also does the prediction in the function

def eval_metric(model, X_train, y_train, X_test, y_test, i):
    """
    Print confusion matrix and classification report for test and train sets.

    The model is used to predict on both sets inside the function. The test
    set report is printed first, then the train set report; each header is
    prefixed with ``i``.

    Parameters:
        model: Fitted estimator exposing ``predict``.
        X_train, y_train: Training features and labels.
        X_test, y_test: Test features and labels.
        i: Label printed in front of the 'Test_Set' / 'Train_Set' headers.
    """
    train_predictions = model.predict(X_train)
    test_predictions = model.predict(X_test)

    def _report(split_name, y_true, y_hat):
        # Header, confusion matrix and per-class metrics for one split
        print(f"{i} {split_name}")
        print(confusion_matrix(y_true, y_hat))
        print(classification_report(y_true, y_hat))

    _report("Test_Set", y_test, test_predictions)
    print()
    _report("Train_Set", y_train, train_predictions)
    
#####################################################################################

# Function to display Feature Importance
def plot_feature_importance(model, X_train, figsize=(8, 5)):
    """
    Plot a fitted model's feature importances as annotated horizontal bars.

    The importances are read from ``model.feature_importances_``, labelled
    with the columns of ``X_train``, sorted in descending order and drawn
    with the value (two decimals) written next to each bar. Nothing is
    returned.

    Note: a later definition with the same name in this file replaces this
    one when both cells are run.
    """
    # Importances indexed by feature name, largest first
    ranked = pd.Series(
        data=model.feature_importances_,
        index=X_train.columns,
    ).sort_values(ascending=False)

    plt.figure(figsize=figsize)
    sns.barplot(y=ranked.index, x=ranked.values, orient='h', palette='Blues')

    # Annotate each bar with its importance value
    for position, importance in enumerate(ranked.values):
        plt.text(importance, position, f'{importance:.2f}', va='center',fontsize=10)

    plt.title("Feature Importances")
    plt.xlabel("Importance")
    plt.ylabel("Features")
    plt.show()
#####################################################################################

# Function to display Feature Importance
def plot_feature_importance(model, X_train, figsize=(8, 5)):
    """
    Plot a fitted model's feature importances and return them as a table.

    The importances are read from ``model.feature_importances_`` and labelled
    with the columns of ``X_train``.

    Returns:
    - A DataFrame with a single 'importance' column, indexed by feature name
      and sorted in descending order.
    """
    # Importance table, largest value first
    feats = pd.DataFrame(
        data=model.feature_importances_,
        index=X_train.columns,
        columns=['importance'],
    ).sort_values("importance", ascending=False)

    # Horizontal bar chart of the sorted importances
    plt.figure(figsize=figsize)
    sns.barplot(data=feats, y=feats.index, x='importance', orient='h', palette='Blues')
    plt.title("Feature Importances")
    plt.show()

    return feats