# Exploratory Data Analysis

In [1]:
def show_object_data(data):
    """Print a summary of the object-dtype columns of ``data``.

    ``DataFrame.info()`` writes its report to stdout itself and returns
    ``None``, so it is called directly; wrapping it in ``print`` would add
    a stray "None" line after the report.

    Args:
        data: DataFrame whose object-dtype columns are summarised.
    """
    data.select_dtypes(include=['object']).info()

In [2]:
def show_missing_data_count(data):
    """Print the number of missing values per column, smallest count first.

    Args:
        data: DataFrame to inspect.
    """
    missing_per_column = data.isnull().sum()
    ordered_counts = missing_per_column.sort_values(ascending=True)
    print(ordered_counts)

In [3]:
def fill_missing_value_in_columns_with_mode_value(data, col):
    """Fill missing values of ``data[col]`` in place with the column's mode.

    The fill value is the first entry returned by ``Series.mode()``. When
    the column holds no non-missing values there is no mode, so the column
    is left untouched instead of raising on the empty result.

    Args:
        data: DataFrame that is modified in place.
        col: Name of the column to fill.
    """
    modes = data[col].mode()
    if modes.empty:
        # Nothing to impute from: every value in the column is missing.
        return
    data[col] = data[col].fillna(modes.iloc[0])


In [4]:
def lower_string_in_column(data, col):
    """Lowercase every string in ``data[col]``, modifying ``data`` in place.

    Args:
        data: DataFrame that is modified in place.
        col: Name of the string column to lowercase.
    """
    lowered = data[col].str.lower()
    data[col] = lowered

In [5]:
def drop_columns_from_train_test(train_data, test_data, col):
    """Return copies of both frames with the column ``col`` removed.

    The input frames are not modified.

    Args:
        train_data: Training DataFrame.
        test_data: Test DataFrame.
        col: Name of the single column to drop from both frames.

    Returns:
        A ``(train, test)`` tuple of the reduced DataFrames.
    """
    columns_to_drop = [col]
    return (
        train_data.drop(columns=columns_to_drop),
        test_data.drop(columns=columns_to_drop),
    )


# EDA for Numerical Columns

In [6]:
def show_numeric_columns_in_hist_and_boxplot(data, num_cols):
    """Show a histogram and a box plot side by side for each numeric column.

    The plots are drawn from the ``data`` argument (not from a module-level
    frame), so the helper works for any DataFrame passed in.

    Args:
        data: DataFrame holding the columns to plot.
        num_cols: Iterable of column names; a column named ``'id'`` is skipped.
    """
    for col in num_cols:
        if col == 'id':
            continue
        plt.figure(figsize=(8, 5))

        # Left panel: distribution of the column.
        plt.subplot(1, 2, 1)
        plt.title(col)
        data[col].hist(bins=25, grid=False, color='#86bf91', zorder=5, rwidth=0.8)
        plt.ylabel('Count')

        # Right panel: box plot of the same column.
        plt.subplot(1, 2, 2)
        sns.boxplot(x=data[col], color='#86bf91')
        plt.show()


# EDA for Categorical Columns

In [7]:
def lower_strings_and_fill_missing_values_with_mode_value(data, col):
    """Lowercase ``data[col]`` and fill its missing values with the mode.

    The column is lowercased first so the mode is computed on the
    normalised strings. The fill value is the first entry returned by
    ``Series.mode()``. If the column has no non-missing values there is no
    mode, and the (lowercased) column is left with its missing values
    instead of raising on the empty result.

    Args:
        data: DataFrame that is modified in place.
        col: Name of the string column to clean.
    """
    data[col] = data[col].str.lower()
    modes = data[col].mode()
    if modes.empty:
        # Nothing to impute from: every value in the column is missing.
        return
    data[col] = data[col].fillna(modes.iloc[0])

In [8]:
def count_non_alpha_strings_in_column(data, col):
    """Count the string entries of ``data[col]`` that are not purely alphabetic.

    Non-string entries (for example NaN for missing values) are skipped,
    mirroring ``find_non_alpha_strings_in_column``; previously they raised
    ``AttributeError`` because they have no ``isalpha`` method.

    Args:
        data: DataFrame to inspect.
        col: Name of the column to scan.

    Returns:
        Number of string entries for which ``str.isalpha()`` is false.
    """
    return sum(
        1
        for entry in data[col]
        if isinstance(entry, str) and not entry.isalpha()
    )
    

In [9]:
def map_cleaned_data_into_column(data, col):
    """Map non-alphabetic entries of ``data[col]`` to a city name they contain.

    Each unique non-alphabetic entry is whitespace-normalised and split on
    "/" or "-". The first resulting piece whose lowercase form is found in
    ``turkey_cities`` becomes the mapped value. Entries with no matching
    piece are left out of the mapping.

    NOTE(review): ``turkey_cities`` is read from the enclosing scope and is
    presumably a collection of lowercase Turkish city names; confirm.

    Args:
        data: DataFrame to inspect.
        col: Name of the column holding the raw entries.

    Returns:
        Dict from the original entry to the matched piece (trimmed, original
        casing).
    """
    non_alpha_words = find_non_alpha_strings_in_column(data, col)
    print(f"Found {len(non_alpha_words)} non-alpha entries.")

    original_to_city = {}
    for raw_entry in non_alpha_words:
        # Collapse runs of whitespace into single spaces.
        normalized = re.sub(r'\s+', ' ', raw_entry.strip())

        # Split on the non-alpha separators ("/" or "-") and trim each piece.
        candidates = (piece.strip() for piece in re.split(r'[/-]', normalized))

        # Keep only the first piece that is a Turkish city, if any.
        first_city = next(
            (piece for piece in candidates if piece.lower() in turkey_cities),
            None,
        )
        if first_city is not None:
            original_to_city[raw_entry] = first_city

    return original_to_city

In [10]:
# Finder for entries that contain non-alphabetic characters
def find_non_alpha_strings_in_column(data, col):
    """Return the unique non-alphabetic strings of ``data[col]``.

    Only ``str`` entries are considered; anything else (such as NaN) is
    ignored. Results keep the order of first appearance. De-duplication
    uses a dict, which preserves insertion order and avoids rescanning a
    list for every entry.

    Args:
        data: DataFrame to inspect.
        col: Name of the column to scan.

    Returns:
        List of distinct strings for which ``str.isalpha()`` is false.
    """
    unique_non_alpha = dict.fromkeys(
        entry
        for entry in data[col]
        if isinstance(entry, str) and not entry.isalpha()
    )
    return list(unique_non_alpha)

In [11]:
def clean_lise_bolumu(deger):
    """Normalise a free-text high-school track into one of four labels.

    The value is matched by substring against keyword groups, checked in
    this order: 'sayısal', 'eşit ağırlık', 'sözel', 'dil'. The first group
    with a hit wins.

    Two lowercase forms are tested, because ``str.lower()`` is not
    Turkish-aware: it turns 'I' into 'i' (not 'ı') and 'İ' into 'i' plus a
    combining dot. Without this, upper-case inputs such as "SAYISAL" or
    "EŞİT" would never match.

    Args:
        deger: Raw value; non-string input (e.g. NaN) yields ``np.nan``.

    Returns:
        The normalised label, or ``np.nan`` when nothing matches.
    """
    if not isinstance(deger, str):
        # Missing values arrive as float NaN and have no ``lower`` method.
        return np.nan

    # Default lowercasing, with the stray combining dot from 'İ' removed.
    plain = deger.lower().replace('i\u0307', 'i')
    # Turkish lowercasing: 'I' -> 'ı' and 'İ' -> 'i'.
    turkish = deger.replace('I', 'ı').replace('İ', 'i').lower()
    forms = (plain, turkish)

    tracks = (
        ('sayısal', ('sayısal', 'mf', 'fen', 'matematik')),
        ('eşit ağırlık', ('eşit', 'tm')),
        ('sözel', ('sözel', 'ts')),
        ('dil', ('yabancı', 'dil')),
    )
    for label, keywords in tracks:
        if any(keyword in form for keyword in keywords for form in forms):
            return label
    return np.nan

In [12]:
def replace_scholarships(value, replace_dict):
    """Replace ``value`` with the canonical key whose variations contain it.

    Args:
        value: Raw value to normalise; missing values map to ``np.nan``.
        replace_dict: Mapping of canonical name to a collection of variations.

    Returns:
        The first key whose variations include ``value``, ``value`` itself
        when no key matches, or ``np.nan`` for missing input.
    """
    if pd.isna(value):
        return np.nan
    return next(
        (
            canonical
            for canonical, variations in replace_dict.items()
            if value in variations
        ),
        value,
    )

In [13]:
def clean_amount(amount):
    """Extract the leading number from a free-text money amount.

    The text is lowercased, the currency markers 'tl' and '₺' and all
    spaces are removed, and commas become decimal points. The number is
    then read from the start of what remains.

    Args:
        amount: Raw amount; only ``str`` input is parsed.

    Returns:
        The parsed ``float``, or ``None`` when the input is not a string or
        does not start with a digit after cleaning.
    """
    if not isinstance(amount, str):
        return None

    normalized = amount.lower()
    # Order matters: 'tl' is removed before spaces are stripped.
    for old, new in (('tl', ''), ('₺', ''), (' ', ''), (',', '.')):
        normalized = normalized.replace(old, new)

    leading_number = re.match(r'(\d+\.?\d*)', normalized)
    return float(leading_number.group(1)) if leading_number else None



In [14]:
def categorize_amount(amount):
    """Bucket a numeric amount into one of three labelled ranges.

    Args:
        amount: Numeric amount, or a missing value.

    Returns:
        '-' for missing input, otherwise the label of the range containing
        ``amount`` (below 500, below 1000, or 1000 and above).
    """
    if pd.isna(amount):
        return '-'

    # Exclusive upper bounds, checked in ascending order.
    brackets = ((500, '0 - 499₺'), (1000, '500₺ - 999₺'))
    for upper_bound, label in brackets:
        if amount < upper_bound:
            return label
    return '1000₺ ve üstü'

In [15]:
def show_all_one_hot_encoding_categories_for_train_and_test(one_hot_encoding_columns):
    """Print the unique values of each listed column in train and test data.

    Reads the frames ``train_data`` and ``test_data`` from the enclosing
    (module) scope rather than taking them as arguments.

    Args:
        one_hot_encoding_columns: Iterable of column names to display.
    """
    for column_name in one_hot_encoding_columns:
        print(column_name)
        train_categories = train_data[column_name].unique()
        print(train_categories)
        test_categories = test_data[column_name].unique()
        print(test_categories)

In [16]:
def frequency_encoding(data, columns):
    """Replace each value in the given columns with its occurrence count.

    The frame is modified in place and also returned.

    Args:
        data: DataFrame to encode.
        columns: Iterable of column names to frequency-encode.

    Returns:
        The same ``data`` object, with the listed columns encoded.
    """
    for name in columns:
        data[name] = data[name].map(data[name].value_counts())
    return data

## Check Data Columns

In [17]:
def check_column_in_train(train_data, col):
    """Print the unique values and the missing-value count of one column.

    Args:
        train_data: DataFrame to inspect.
        col: Name of the column to report on.
    """
    print("Train Data Uniqueness")
    distinct_values = train_data[col].unique()
    print(distinct_values)
    print("--------------------")
    print("Train Data Isna()")
    rows_with_missing = train_data[train_data[col].isna()]
    print(len(rows_with_missing))

In [18]:
def check_columns_in_train(train_data, col):
    """Print unique values and missing-value counts for several columns.

    Args:
        train_data: DataFrame to inspect.
        col: Iterable of column names; one report is printed per name.
    """
    for column_name in col:
        print("Train Data Uniqueness")
        distinct_values = train_data[column_name].unique()
        print(distinct_values)
        print("--------------------")
        print("Train Data Isna()")
        rows_with_missing = train_data[train_data[column_name].isna()]
        print(len(rows_with_missing))

In [19]:
def check_column_in_train_test(train_data, test_data, col):
    """Print unique values and missing-value counts of a column in both frames.

    The unique values of train and test are shown first, followed by the
    number of rows with a missing value in each.

    Args:
        train_data: Training DataFrame.
        test_data: Test DataFrame.
        col: Name of the column to report on.
    """
    # Unique values, train then test.
    print("Train Data Uniqueness")
    train_distinct = train_data[col].unique()
    print(train_distinct)
    print("--------------------")
    print("Test Data Uniqueness")
    test_distinct = test_data[col].unique()
    print(test_distinct)
    print("---------------------")
    print("---------------------")

    # Missing-value counts, train then test.
    print("Train Data Isna()")
    train_missing_rows = train_data[train_data[col].isna()]
    print(len(train_missing_rows))
    print("---------------------")
    print("Test Data Isna()")
    test_missing_rows = test_data[test_data[col].isna()]
    print(len(test_missing_rows))


## Plotting Graphs

In [20]:
def plot_categorical_columns_in_bar_plots(data, col):
    """Show a count plot of the categories in ``data[col]``.

    Args:
        data: DataFrame holding the column.
        col: Name of the categorical column to plot.
    """
    plt.figure(figsize=(4, 3))

    # The column is also passed as hue, with the legend turned off.
    countplot_options = {
        'x': col,
        'hue': col,
        'data': data,
        'palette': 'viridis',
        'legend': False,
    }
    sns.countplot(**countplot_options)

    plt.xticks(rotation=45)
    plt.xlabel(col)
    plt.ylabel('Count')
    plt.show()

In [21]:
def plot_categorical_columns_that_have_more_than_thousand_unique_values(data, col, top_n=20):
    """Show a horizontal bar plot of the most frequent values of ``data[col]``.

    Intended for high-cardinality columns where plotting every category is
    impractical, so only the ``top_n`` most frequent ones are drawn.

    Args:
        data: DataFrame holding the column.
        col: Name of the categorical column to plot.
        top_n: Number of most frequent categories to show (default 20).
    """
    # Series.value_counts replaces the deprecated top-level pd.value_counts;
    # the result is sorted by descending frequency.
    top_values = data[col].value_counts().head(top_n)

    plt.figure(figsize=(7, 4))
    # Passing hue with legend=False keeps the per-bar palette without the
    # seaborn deprecation warning for palette-without-hue.
    sns.barplot(
        x=top_values.values,
        y=top_values.index,
        hue=top_values.index,
        palette='viridis',
        legend=False,
    )

    # Add labels
    plt.title(f'Top {top_n} Categories in Categorical Column')
    plt.xlabel('Frequency')
    plt.ylabel(col)
    plt.show()