In [1]:
def find_columns_with_less_than_x_categories(data, cat_cols, num):
    """Return the columns from ``cat_cols`` with at most ``num`` distinct values.

    Args:
        data: DataFrame-like object indexed by column name.
        cat_cols: Iterable of column names to inspect.
        num: Inclusive upper bound on the number of distinct values.

    Returns:
        A list of the qualifying column names, in the order they appear in
        ``cat_cols``. The count comes from ``Series.unique()``, so a missing
        value is counted as a distinct value as well.
    """
    return [
        column
        for column in cat_cols
        if len(data[column].unique()) <= num
    ]
    
    

In [2]:
def show_unique_values_of_columns_in_train_and_test_data(data, test_data, cols):
    """Print unique values and missing-row counts per column for both datasets.

    For every column in ``cols`` the train block is printed first, then the
    test block. Each block contains a header, the column name with its unique
    values, a missing-value header, the number of rows where the column is
    NA, and a separator line.

    Args:
        data: Training DataFrame.
        test_data: Test DataFrame.
        cols: Iterable of column names present in both frames.
    """
    separator = "--------------"
    labelled_frames = (("Train Data", data), ("Test Data", test_data))
    for column in cols:
        for label, frame in labelled_frames:
            series = frame[column]
            print(label)
            print(column, series.unique())
            print(label + " Isna()")
            print(len(frame[series.isna()]))
            print(separator)
    

In [3]:
def show_missing_data_count(data):
    """Print the number of missing values per column, smallest count first.

    Args:
        data: DataFrame whose columns are inspected with ``isnull()``.
    """
    missing_per_column = data.isnull().sum()
    print(missing_per_column.sort_values(ascending=True))

#### Function: `replace_month`

This function replaces Turkish month names in a string with their English equivalents.

#### Parameters:
- **text**: A string potentially containing a month name.

#### Process:
1. Checks if the input is a string.
2. Uses regex to extract alphabetic substrings.
3. Replaces any found month name with its English equivalent from `month_map`.

#### Returns:
- The modified string with the replaced month name, or the original text if no match is found.


In [4]:
def replace_month(text):
    """Replace a month name found in ``text`` using ``month_map``.

    The text is scanned for runs of letters. The first run that is a key of
    ``month_map`` (a mapping defined elsewhere in this file) is replaced,
    everywhere it occurs in ``text``, by the mapped value.

    Args:
        text: Value to process. Non-string values are returned untouched.

    Returns:
        The text with the month name replaced, or the original value when it
        is not a string or contains no word that is a key of ``month_map``.
    """
    if not isinstance(text, str):
        return text
    # [^\W\d_]+ matches runs of Unicode letters. The ASCII-only class
    # [a-zA-Z]+ would cut words at non-ASCII letters such as "Ş", "ğ", "ü"
    # or "ı", so a word like "Şubat" would be seen as "ubat" and never be
    # found in the map.
    for word in re.findall(r'[^\W\d_]+', text):
        if word in month_map:
            return text.replace(word, month_map[word])
    return text

#### Function: `replace_birth_dates`

This function converts a birth date string into the format **DD.MM.YYYY**.

#### Parameters:
- **text**: A string representing a birth date, which may include mixed content.

#### Process:
1. Attempts to parse the date using `fuzzy_with_tokens` to handle various formats and content.
2. If successful, it formats the date as **DD.MM.YYYY**.

#### Returns:
- The formatted date string if parsing is successful, or the original string if parsing fails.


In [5]:
def replace_birth_dates(text):
    """Normalise a birth date value to the ``DD.MM.YYYY`` string format.

    Args:
        text: Raw birth date value, possibly mixed with other content.

    Returns:
        The parsed date formatted as ``DD.MM.YYYY``, or ``text`` unchanged
        when it cannot be parsed.
    """
    try:
        # `fuzzy_with_tokens` lets the parser skip non-date content and
        # returns a (datetime, skipped_tokens) pair; only the datetime is used.
        # NOTE(review): `parse` is imported elsewhere in the file, presumably
        # dateutil.parser.parse - confirm.
        dt, _ = parse(text, fuzzy_with_tokens=True)
        return dt.strftime('%d.%m.%Y')
    except (ValueError, TypeError, OverflowError):
        # ValueError: unparseable text; TypeError: non-string input such as
        # NaN; OverflowError: numeric fields too large for the platform.
        # In every case the original value is kept.
        return text

#### Function: `find_age`

This function calculates a person's age based on their birth date.

#### Parameters:
- **text**: A string representing the birth date in the format **DD.MM.YYYY**.

#### Process:
1. Checks if the input is missing (`NaN`). If so, returns `None`.
2. Converts the string to a date object.
3. Calculates the age by subtracting the birth year from the current year, adjusting for whether the birthday has occurred this year.
4. Validates the calculated age, returning `None` for ages less than 0 or greater than 100.

#### Returns:
- The calculated age as an integer, or `None` if the input is invalid or the calculated age is out of bounds.

#### Usage:
- This function is applied to the `col` column in the DataFrame to create a new column `Yas` containing the calculated ages.


In [6]:
def find_age(text):
    """Compute the age in whole years from a ``DD.MM.YYYY`` birth date string.

    Args:
        text: Birth date as a ``DD.MM.YYYY`` string, or a missing value.

    Returns:
        The age as an int, or ``None`` when the input is missing, is not a
        valid ``DD.MM.YYYY`` string, or yields an age outside 0..100.
    """
    try:
        if pd.isna(text):
            return None
        born = datetime.strptime(text, '%d.%m.%Y').date()
    except (ValueError, TypeError):
        # Wrong format or a non-string value.
        return None

    today = date.today()
    # True (counted as 1) while this year's birthday is still ahead.
    birthday_pending = (today.month, today.day) < (born.month, born.day)
    age = today.year - born.year - birthday_pending
    return age if 0 <= age <= 100 else None




In [7]:
def display_bar_plot_for_categorical_cols(data, cols):
    """Draw one value-count bar plot per column on a grid four plots wide.

    Args:
        data: DataFrame holding the columns to plot.
        cols: Iterable of column names; one subplot is drawn for each.

    Returns:
        None. The figure is displayed with ``plt.show()``. Nothing is drawn
        when ``cols`` is empty.
    """
    cols = list(cols)
    total_categories = len(cols)
    if total_categories == 0:
        # A grid with zero rows cannot be created; there is nothing to plot.
        return

    # Number of plots shown per row.
    n_cols = 4
    # Ceiling division gives the number of rows required.
    n_rows = (total_categories + n_cols - 1) // n_cols

    # squeeze=False keeps `axs` two-dimensional even for a single row, so the
    # grid can be handled uniformly (a single-row grid is otherwise returned
    # as a 1-D array and 2-D indexing fails).
    fig, axs = plt.subplots(n_rows, n_cols, figsize=(20, n_rows * 5), squeeze=False)
    flat_axes = axs.flatten()

    # Draw the plots, filling the grid row by row.
    for ax, col in zip(flat_axes, cols):
        counts = data[col].value_counts()
        sns.barplot(x=counts.index, y=counts, ax=ax, width=0.3)
        ax.set_title(col)
        ax.tick_params(axis='x', rotation=45)  # rotate the x-axis labels

    # Remove the subplot slots left empty in the last row.
    for unused_ax in flat_axes[total_categories:]:
        fig.delaxes(unused_ax)

    plt.tight_layout()
    plt.show()


#### 1. `match_cities_in_train_data_with_test_data`
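This function standardizes and matches city names between two datasets: `train_data` and `test_data`. Before the steps below, it first calls `map_other` to normalize placeholder values ('Diğer', '------', 'Yok') in `train_data` to 'Diger'.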

- **Step 1**: Corrects spelling mistakes in city names (like Istanbul and Izmir) using the `replace_izmir_istanbul_cities_with_correct_spelling` function.
- **Step 2**: Removes non-alphabetic characters (like '-', '/', ',') from `train_data` and matches cleaned names with `test_data` using `remove_non_alpha_characters_from_data_and_match_with_test_data`.
- **Step 3**: Maps Istanbul districts to "İstanbul" using the `map_istanbul_districts` function.
- **Step 4**: Matches similar city names between `train_data` and `test_data` using fuzzy matching in the `match_strings_with_its_similars` function.

##### 2. `replace_izmir_istanbul_cities_with_correct_spelling`
Corrects spelling variations of major city names (like 'Istanbul' and 'Izmir') by replacing them with their standardized Turkish forms ('İstanbul', 'İzmir').

##### 3. `remove_non_alpha_characters_from_data_and_match_with_test_data`
Removes non-alphabetic characters (such as `-`, `/`, `,`) from city names in `train_data`. Then, it splits the city names and matches each part with values in `test_data`.

##### 4. `map_istanbul_districts`
Maps Istanbul districts in `train_data` to the city name "İstanbul". It checks whether a string is one of the districts and replaces it accordingly.

##### 5. `match_strings_with_its_similars`
Uses fuzzy matching to find the closest match for each city name in `train_data` from `test_data`. If no close match is found (i.e., similarity score below 50), the string is replaced with "Diger" (Other). Otherwise, the closest match is assigned to `train_data`.


In [8]:
def match_cities_in_train_data_with_test_data(train_data, test_data, col):
    """Run the city clean-up steps on ``train_data[col]`` against ``test_data``.

    Steps, in order: ``map_other``,
    ``replace_izmir_istanbul_cities_with_correct_spelling``,
    ``remove_non_alpha_characters_from_data_and_match_with_test_data``,
    ``map_istanbul_districts`` (applied per value) and
    ``match_strings_with_its_similars`` with the unique values of
    ``test_data[col]`` as reference texts.

    Args:
        train_data: Training DataFrame whose column is cleaned.
        test_data: Test DataFrame providing the reference values.
        col: Name of the city column present in both frames.

    Returns:
        The training DataFrame returned by the last step.
    """
    train_data = map_other(train_data, col)
    train_data = replace_izmir_istanbul_cities_with_correct_spelling(train_data, col)
    train_data = remove_non_alpha_characters_from_data_and_match_with_test_data(
        train_data, test_data, col
    )
    train_data[col] = train_data[col].apply(map_istanbul_districts)

    known_cities = test_data[col].unique().tolist()
    # The list of matched pairs is not needed here; only the frame is kept.
    _, train_data = match_strings_with_its_similars(train_data, col, known_cities)
    return train_data

In [9]:
def map_other(data, col):
    """Collapse the placeholder spellings in ``data[col]`` to ``'Diger'``.

    Values exactly equal to ``'Diğer'``, ``'------'``, ``'Diger'`` or
    ``'Yok'`` become ``'Diger'``; every other value is left as is.

    Args:
        data: DataFrame to update (the column is reassigned in place).
        col: Name of the column to normalise.

    Returns:
        The same ``data`` object.
    """
    placeholders = ('Diğer', '------', 'Diger', 'Yok')
    other_map = dict.fromkeys(placeholders, 'Diger')
    data[col] = data[col].replace(other_map)
    return data

#### Function: `replace_izmir_istanbul_cities_with_correct_spelling`

This function standardizes city names in the specified column of the dataset. 

1. **Mapping**: Uses a dictionary to replace 'Istanbul' with 'İstanbul' and 'Izmir' with 'İzmir'.
2. **Specific Replacements**: Checks for any occurrences of 'Istanbul' or 'Izmir' in strings and updates them accordingly.

The function ensures consistent naming for major cities.


In [10]:
def replace_izmir_istanbul_cities_with_correct_spelling(data, col):
    """Normalise the spellings of İstanbul and İzmir in ``data[col]``.

    Exact matches are replaced through a lookup table first. Afterwards any
    string that merely contains one of the variant spellings is replaced by
    the canonical city name as a whole.

    Args:
        data: DataFrame to update (the column is reassigned in place).
        col: Name of the city column.

    Returns:
        The same ``data`` object.
    """
    # Variant of "İzmir" whose lowercase "i" is followed by a combining dot
    # above (U+0307); written with an escape so the invisible mark is explicit.
    decomposed_izmir = 'İzmi\u0307r'
    exact_fixes = {
        'Istanbul': 'İstanbul',
        'Izmir': 'İzmir',
        decomposed_izmir: 'İzmir',
    }

    def fix_embedded_spelling(value):
        # Non-string values (e.g. NaN) pass through untouched.
        if not isinstance(value, str):
            return value
        if 'Istanbul' in value:
            return 'İstanbul'
        if 'Izmir' in value or decomposed_izmir in value:
            return 'İzmir'
        return value

    data[col] = data[col].replace(exact_fixes)
    data[col] = data[col].apply(fix_embedded_spelling)
    return data

#### Function: `remove_non_alpha_characters_from_data_and_match_with_test_data`

This function cleans the specified column in the training dataset by removing non-alphabetic characters and aligning the data with the test dataset. 

1. **Mask Creation**: Identifies entries containing non-alphabetic characters.
2. **Data Splitting**: Splits these entries into individual components.
3. **Matching**: Updates matched values from the test dataset.
4. **Return**: Returns the cleaned training dataset.

The function ensures consistency between the training and test datasets.


In [11]:
def remove_non_alpha_characters_from_data_and_match_with_test_data(train_data, test_data, col):
    """Replace separator-containing train values by a part known to the test set.

    Every string in ``train_data[col]`` that contains a space, ``-``, ``/``,
    ``,`` or ``.`` is split on those characters. The first part that is also
    a value of ``test_data[col]`` replaces the whole entry; entries without
    such a part are left unchanged.

    Args:
        train_data: Training DataFrame, modified in place.
        test_data: Test DataFrame providing the known values.
        col: Name of the column present in both frames.

    Returns:
        The same ``train_data`` object.
    """
    separators = re.compile(r'[ \-/,.]')
    has_separator = train_data[col].apply(
        lambda value: isinstance(value, str) and separators.search(value) is not None
    )
    known_values = set(test_data[col].unique())

    for row_label in train_data[has_separator].index:
        pieces = separators.split(train_data.at[row_label, col])
        # First piece that the test data knows, if any.
        hit = next((piece for piece in pieces if piece in known_values), None)
        if hit is not None:
            train_data.loc[row_label, col] = hit

    return train_data


#### `map_istanbul_districts` Function

This function checks if a given string corresponds to a district in Istanbul, and if it does, it maps the string to "İstanbul". Otherwise, it returns the original text.

#### Parameters:
- **`text`**: A string representing a location name.

#### Logic:
- The function verifies if the input `text` is a string.
- It compares the input `text` with a predefined list of **Istanbul district names** (`istanbul_districts`).
- If the input matches any district name in the list, the function returns `"İstanbul"`.
- If no match is found, it returns the original `text`.

In [12]:
# Names treated as districts of İstanbul by `map_istanbul_districts`.
istanbul_districts = [
    "Adalar", "Arnavutköy", "Ataşehir",
    "Avcılar", "Bağcılar", "Bahçelievler",
    "Bakırköy", "Başakşehir", "Bayrampaşa",
    "Beşiktaş", "Beykoz", "Beylikdüzü",
    "Beyoğlu", "Büyükçekmece", "Çatalca",
    "Çekmeköy", "Esenler", "Esenyurt",
    "Eyüpsultan", "Fatih", "Gaziosmanpaşa",
    "Güngören", "Kadıköy", "Kağıthane",
    "Kartal", "Küçükçekmece", "Maltepe",
    "Pendik", "Sancaktepe", "Sarıyer",
    "Şile", "Silivri", "Şişli",
    "Sultanbeyli", "Sultangazi", "Tuzla",
    "Ümraniye", "Üsküdar", "Zeytinburnu",
]

In [13]:
def map_istanbul_districts(text):
    """Map an İstanbul district name to ``'İstanbul'``.

    Args:
        text: Location value; may be a non-string such as NaN.

    Returns:
        ``'İstanbul'`` when ``text`` is a string listed in
        ``istanbul_districts``; otherwise ``text`` itself, unchanged.
        Non-string values are passed through as they are rather than being
        turned into ``None``.
    """
    if isinstance(text, str) and text in istanbul_districts:
        return 'İstanbul'
    return text

#### Function: `match_strings_with_its_similars`

This function matches each string in a specified column with its most similar reference text, using fuzzy string matching.

#### Parameters:
- **data**: The DataFrame containing the column to be processed.
- **col**: The column name in the DataFrame where the strings are located.
- **reference_texts**: A list of strings to match against (e.g., standardized names).
- **Similarity threshold**: The function takes no `min`/`maxx` parameters; the minimum similarity score for a match is hard-coded to 50 inside the function, and no maximum score is applied.

#### Process:
1. Loops through each entry in the `col` column.
2. For each string, it uses fuzzy matching (`fuzz.token_sort_ratio`) to find the top 5 similar strings from `reference_texts`.
3. Selects the best match from the top 5. If the similarity score is below 50, the string is replaced with "Diger".
4. Records the best matches in `matched_texts` for reference and updates the column with the matched string.

#### Returns:
- A list of matched texts along with the updated DataFrame.

#### Usage:
- This function is useful for normalizing text data by matching similar but slightly varied entries to a common reference text.


In [14]:
def match_strings_with_its_similars(data, col, reference_texts):
    """Replace each string in ``data[col]`` by its closest reference text.

    Every distinct string value is scored against the reference texts with
    ``fuzz.token_sort_ratio`` (top 5 candidates). The best candidate replaces
    the value when its score is at least 50; otherwise the value becomes
    ``'Diger'``. Non-string values (e.g. NaN) are left untouched.

    Args:
        data: DataFrame to update (the column is reassigned in place).
        col: Name of the column holding the strings.
        reference_texts: Iterable of candidate texts to match against.

    Returns:
        A ``(matched_texts, data)`` tuple. ``matched_texts`` is a list of
        ``[original_value, (matched_text, score)]`` entries, one per distinct
        value that reached the threshold, in order of first appearance.
    """
    # Only strings can be scored; non-string references (for example NaN
    # coming from a column with missing values) are dropped up front.
    candidates = [ref for ref in reference_texts if isinstance(ref, str)]

    replacements = {}
    matched_texts = []
    # Iterate over the values rather than positional numbers used as labels,
    # so the function also works when the index is not 0..n-1. Each distinct
    # value is scored once.
    for dep in data[col]:
        if not isinstance(dep, str) or dep in replacements:
            continue
        matches = process.extract(dep, candidates, scorer=fuzz.token_sort_ratio, limit=5)
        # `default=None` covers an empty candidate list.
        best_match = max(matches, key=lambda match: match[1], default=None)
        if best_match is None or best_match[1] < 50:
            replacements[dep] = 'Diger'
        else:
            # Keep only (text, score); some matchers append a third element.
            matched_texts.append([dep, best_match[:2]])
            replacements[dep] = best_match[0]

    data[col] = data[col].apply(
        lambda value: replacements[value] if isinstance(value, str) else value
    )
    return matched_texts, data

#### Function: `call_replace_methods`

This function orchestrates a series of string replacement operations on a specified column of a dataset. It integrates multiple replacement methods to standardize data entries.

#### Parameters:
- **data**: The dataset (e.g., DataFrame) that contains the column to be modified.
- **col**: The name of the column in which string replacements will occur.
- **different_pairs**: A list of tuples containing pairs of values for replacement where each tuple consists of an `old_value` (to be replaced) and a `new_value` (replacement).
- **same_pairs**: A list of values that should remain unchanged or be standardized.

#### Process:
1. **Different Substring Replacement**: 
   - Calls `replace_string_with_different_substring` to replace specified `old_value` substrings with `new_value` in the column.
   
2. **Same Substring Replacement**:
   - Calls `replace_string_with_same_substring` so that any string containing one of the `same_pairs` entries is replaced by that entry itself.
   
3. **Other Value Handling**:
   - Calls `replace_string_with_other` to change any remaining values in the column that do not match the specified `same_pairs` to "Diger".

#### Returns:
- The modified dataset after applying all replacement methods.


In [15]:
def call_replace_methods(data, col, different_pairs, same_pairs):
    """Apply the three replacement helpers to ``data[col]`` in sequence.

    Order: ``replace_string_with_different_substring`` (with
    ``different_pairs``), then ``replace_string_with_same_substring`` and
    ``replace_string_with_other`` (both with ``same_pairs``).

    Args:
        data: DataFrame holding the column.
        col: Name of the column to rewrite.
        different_pairs: Iterable of ``(old_value, new_value)`` pairs.
        same_pairs: Collection of values to keep.

    Returns:
        The DataFrame returned by the last helper.
    """
    data = replace_string_with_different_substring(data, col, different_pairs)
    for step in (replace_string_with_same_substring, replace_string_with_other):
        data = step(data, col, same_pairs)
    return data
    

In [16]:
def replace_string_with_different_substring(data, col, different_pairs):
    """Replace strings containing ``old_value`` by ``new_value``, pair by pair.

    The pairs are evaluated in order for every value, so a replacement made
    by one pair can be replaced again by a later pair.

    Args:
        data: DataFrame to update (the column is reassigned in place).
        col: Name of the column to rewrite.
        different_pairs: Iterable of ``(old_value, new_value)`` pairs.

    Returns:
        The same ``data`` object.
    """
    pairs = list(different_pairs)
    if not pairs:
        # No pairs: the column is not touched at all.
        return data

    def rewrite(value):
        for old_value, new_value in pairs:
            if isinstance(value, str) and old_value in value:
                value = new_value
        return value

    data[col] = data[col].apply(rewrite)
    return data

In [17]:
def replace_string_with_same_substring(data, col, same_pairs):
    """Collapse strings that contain a ``same_pairs`` entry to that entry.

    The entries are evaluated in order for every value, so a value collapsed
    to one entry can still be collapsed again by a later entry it contains.

    Args:
        data: DataFrame to update (the column is reassigned in place).
        col: Name of the column to rewrite.
        same_pairs: Iterable of substrings to collapse to.

    Returns:
        The same ``data`` object.
    """
    substrings = tuple(same_pairs)
    if not substrings:
        # Nothing to collapse: the column is not touched at all.
        return data

    def collapse(value):
        for sub in substrings:
            if isinstance(value, str) and sub in value:
                value = sub
        return value

    data[col] = data[col].apply(collapse)
    return data

In [18]:
def replace_string_with_other(data, col, same_pairs):
    """Replace strings that are not an accepted value by ``"Diger"``.

    A string becomes ``"Diger"`` when it equals the placeholder ``"------"``
    or is not contained in ``same_pairs``. Non-string values (e.g. NaN) are
    kept as they are, so missing data stays recognisable as missing.

    Args:
        data: DataFrame to update (the column is reassigned in place).
        col: Name of the column to rewrite.
        same_pairs: Collection of accepted values.

    Returns:
        The same ``data`` object.
    """
    def to_other(value):
        # The string check guards both conditions below. Without explicit
        # grouping, `a and b or c` parses as `(a and b) or c`, which would
        # send every non-string value to "Diger" as well.
        if not isinstance(value, str):
            return value
        if value == "------" or value not in same_pairs:
            return "Diger"
        return value

    data[col] = data[col].apply(to_other)
    return data

#### Function `replace_high_school_departments_with_correct_versions` 

This function standardizes high school department names by replacing them with their correct Turkish versions.

#### Parameters:
- **`data`**: The DataFrame that contains the data.
- **`col`**: The column in the DataFrame where department names are stored.

#### Logic:
1. **Initial Mapping**: 
   - A dictionary named `high_school_department` maps abbreviated or incorrect department names (e.g., `'Ea'`, `'Mf'`) to their full Turkish counterparts (e.g., `'Eşit Ağırlık'`, `'Sayısal'`).
   - The `.replace()` method applies these mappings directly to the column.

2. **Further Specific Replacements**:
   - If a department name contains `'Sosyal'`, it is replaced with `'Sözel'`.
   - If it contains `'Türkçe'`, it is replaced with `'Eşit Ağırlık'`.
   - If it contains `'Fen'`, it is replaced with `'Sayısal'`.

#### Example:
- Input: `"Ea"` → Output: `"Eşit Ağırlık"`
- Input: `"Sosyal Bilimler"` → Output: `"Sözel"`


In [19]:
def replace_high_school_departments_with_correct_versions(data, col):
    """Normalise high school department names in ``data[col]``.

    Exact abbreviations and variants are mapped through a lookup table first.
    Afterwards any string containing ``'Sosyal'``, ``'Türkçe'`` or ``'Fen'``
    is replaced as a whole by ``'Sözel'``, ``'Eşit Ağırlık'`` or
    ``'Sayısal'`` respectively.

    Args:
        data: DataFrame to update (the column is reassigned in place).
        col: Name of the department column.

    Returns:
        The same ``data`` object.
    """
    equal_weight, verbal, numeric, language = 'Eşit Ağırlık', 'Sözel', 'Sayısal', 'Dil'
    exact_names = {
        'Ea': equal_weight,
        'Tm': equal_weight,
        'Ts': verbal,
        'Mf': numeric,
        'Fm': numeric,
        # The "i" in this key is followed by a combining dot above (U+0307),
        # written as an escape so the invisible mark is explicit.
        'Yabancı Di\u0307l': language,
        'Ingilizce': language,
    }
    # Checked in this order; the first keyword found decides the result.
    keyword_rules = (
        ('Sosyal', verbal),
        ('Türkçe', equal_weight),
        ('Fen', numeric),
    )

    def by_keyword(value):
        if isinstance(value, str):
            for keyword, department in keyword_rules:
                if keyword in value:
                    return department
        return value

    data[col] = data[col].replace(exact_names)
    data[col] = data[col].apply(by_keyword)
    return data

In [20]:
def find_cities_that_contain_non_alpha_characters(data, col):
    """Collect the distinct strings in ``data[col]`` that contain a separator.

    A value qualifies when it is a string containing at least one of
    ``-``, ``/``, ``,``, ``.`` or a space.

    Args:
        data: DataFrame to inspect. The column is read from this argument,
            not from any module-level frame.
        col: Name of the city column.

    Returns:
        A list of the distinct qualifying values, in no particular order.
    """
    separators = ('-', '/', ',', '.', ' ')
    non_alpha_cities = {
        value
        for value in data[col]
        if isinstance(value, str) and any(char in value for char in separators)
    }
    return list(non_alpha_cities)

In [21]:
def check_categorical_columns(train_data, test_data, cols):
    """Print each column name followed by its unique train and test values.

    Args:
        train_data: Training DataFrame.
        test_data: Test DataFrame.
        cols: Iterable of column names present in both frames.
    """
    for column in cols:
        print(column)
        for frame in (train_data, test_data):
            print(frame[column].unique())
        