# 1. Business Understanding

Business valuations are crucial for a diverse range of stakeholders: they guide capital allocation decisions based on precise assessments of companies' economic performance, regardless of whether the trends are positive, steady, or negative. The risks of overly negative valuations, of misinterpreting positive trends, and of overlooking negative developments are equally significant. Such misjudgments can impede a company's refinancing options, lead to missed investment prospects for investors, and result in financial losses. In the following code, we evaluate whether aggregated features from LinkedIn improve the quality of default prediction. Three data frames are used: financial metrics only, LinkedIn metrics only, and both combined. The evaluation is considered successful if a positive influence of the LinkedIn features on the prediction can be determined. AUC and recall are considered particularly relevant metrics. Details can be found in the Data chapter of the corresponding master thesis.

# 2. Load data and prepare libraries

With the use of ChatGPT, comments have been added for readability.

## 2.1 Import libraries

In [None]:
import os
import re
import numpy as np
import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor
from prettytable import PrettyTable
from imblearn.over_sampling import ADASYN
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import LabelBinarizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score
import xgboost as xgb
from xgboost import plot_importance, plot_tree, XGBClassifier, XGBRegressor
import seaborn as sns
from datetime import datetime
import matplotlib.pyplot as plt
from fuzzywuzzy import fuzz
import warnings
warnings.filterwarnings("ignore")
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, recall_score
from sklearn.metrics import precision_score

## 2.2 Load datasets

In [None]:
# Read the company list workbook into df_up and preview the first rows.
# NOTE(review): the path is absolute and user-specific, so this cell only runs
# unchanged on the author's machine.
dateipfad = r'C:\Users\chiar\OneDrive\Masterthesis\Modell\List of companys_onetemplate.xls'
df_up = pd.read_excel(dateipfad)
df_up.head()

In [None]:
dateipfad = r'C:\Users\chiar\OneDrive\Masterthesis\Modell\df_waf_final.csv'
df_waf_rfm = pd.read_csv(dateipfad, sep=';')
df_waf_rfm.head()

In [None]:
df_waf_rfm.shape

Author knowledge: initialisation values were used when df_waf_rfm was generated. They are dropped from the dataframe before the merge.

In [None]:
# Select the rows whose value in the column "Number of employees 2014" equals 1
# NOTE(review): presumably 1 is the initialisation placeholder mentioned in the
# author's note - confirm against the code that generated df_waf_rfm.
filtered_df = df_waf_rfm[df_waf_rfm['Number of employees 2014'] == 1]
filtered_df

In [None]:
filtered_df.shape

In [None]:
# Keep only the rows whose "Number of employees 2014" is different from 1
# (the rows inspected in filtered_df are removed before the merge).
df_waf_rfm = df_waf_rfm[df_waf_rfm['Number of employees 2014'] != 1]

In [None]:
dateipfad = r'C:\Users\chiar\OneDrive\Masterthesis\Modell\Matching.csv'
df_match = pd.read_csv(dateipfad, sep=';')
df_match.head()

## 2.3 Merge Datasets

Cleaning Company name to make a match with firm_original_name possible.

In [None]:
# Keep the raw name in "Copy_Company_Name" so the original spelling stays
# available after the cleaning steps below.
df_up['Copy_Company_Name'] = df_up['Company Name']

# Remove every parenthesised group from the "Company Name" column.
# The pattern matches one "(...)" group at a time. A greedy r'\(.*\)' would
# additionally delete all text between the first "(" and the last ")" when a
# name contains more than one group.
df_up['Company Name'] = df_up['Company Name'].apply(lambda x: re.sub(r'\([^)]*\)', '', str(x)).strip())

# Show the DataFrame df_up after the modifications
df_up

In [None]:
# Strip punctuation from a company name (word characters and whitespace are kept)
def clean_company_name(name):
    """Return ``name`` converted to ``str`` with every character removed that
    is neither a word character (letter, digit, underscore) nor whitespace."""
    text = str(name)
    cleaned = re.sub(r'[^\w\s]', '', text)
    return cleaned

# Clean the Company Name column using the clean_company_name function in DataFrame df_up
df_up['Company Name'] = df_up['Company Name'].apply(clean_company_name)
df_up

In [None]:
# Function for fuzzy matching to find the best match for each company name
def find_best_match(company_name, reference_names, threshold_similarity=95):
    """Return the reference name most similar to ``company_name``.

    Similarity is measured with ``fuzz.token_set_ratio``. On ties the first
    reference name with the highest score wins.

    Parameters
    ----------
    company_name : str
        Name to look up.
    reference_names : iterable of str
        Candidate names to compare against.
    threshold_similarity : int, optional
        Minimum score the best candidate must reach to be accepted
        (default 95, the value previously hard-coded in the function).

    Returns
    -------
    str or None
        The best-scoring reference name, or ``None`` when no candidate
        reaches ``threshold_similarity`` (including an empty candidate list).
    """
    best_match = None
    best_similarity = 0

    for ref_name in reference_names:
        similarity = fuzz.token_set_ratio(company_name, ref_name)
        if similarity > best_similarity:
            best_match = ref_name
            best_similarity = similarity
            # A score of 100 cannot be beaten (only strictly higher scores
            # replace the current best), so the remaining names are skipped.
            if best_similarity >= 100:
                break

    # Return the best match if similarity score is above the threshold, else return None
    return best_match if best_similarity >= threshold_similarity else None

# Reference names from df_waf_rfm that every df_up name is compared against
firm_original_names_waf = df_waf_rfm['Firm_original_name'].tolist()

# Cleaned company names from df_up, in row order
company_names_up = df_up['Company Name'].tolist()

# Pair each df_up name with the result of find_best_match (None when no
# reference name reaches the similarity threshold)
matched_companies = [
    (company_name_up, find_best_match(company_name_up, firm_original_names_waf))
    for company_name_up in company_names_up
]

# One row per df_up company: the cleaned name and its best match
results_matching = pd.DataFrame(matched_companies, columns=['Company Name Up', 'Best Match in df_waf_rfm'])

# Display the results
results_matching


In [None]:
# Count the number of entries where the Best Match is None in results_matching
num_none_matches = results_matching['Best Match in df_waf_rfm'].isna().sum()

# Display the result
print("Number of entries with 'None' in Best Match:", num_none_matches)

In [None]:
# Step 1: Left-join df_up with results_matching on the cleaned company name, so
# every df_up row carries its 'Best Match in df_waf_rfm' value.
# NOTE(review): if a cleaned company name occurs more than once, this join yields
# one row per combination, and the row labels of df_up_merged no longer line up
# with those of df_up used in the loop below - verify that names are unique.
df_up_merged = pd.merge(df_up, results_matching, left_on='Company Name', right_on='Company Name Up', how='left')

# Step 2 and 3: Iterate over the Company Names in df_up and search in results_matching
# (index_up is the df_up row label; it is reused as row label in df_up_merged).
for index_up, row_up in df_up.iterrows():
    company_name_up = row_up['Company Name']
    
    # Step 4: Look up the best match for this name (first hit in results_matching)
    # and check whether it is missing
    best_match_waf = results_matching.loc[results_matching['Company Name Up'] == company_name_up, 'Best Match in df_waf_rfm'].values[0]
    if pd.isna(best_match_waf):
        # Step 5: No match - set all df_waf_rfm columns of this row to None
        df_up_merged.loc[index_up, df_waf_rfm.columns] = None
    else:
        # Step 6: Match found - copy the values of the first df_waf_rfm row with that
        # 'Firm_original_name' into the df_waf_rfm columns of this row
        row_waf = df_waf_rfm.loc[df_waf_rfm['Firm_original_name'] == best_match_waf]
        df_up_merged.loc[index_up, df_waf_rfm.columns] = row_waf.values[0]

# Display the merged result
df_up_merged

In [None]:
# Count the number of entries where the Number of employees 2014 is NaN in df_up_merged
num_nan_employees = df_up_merged['Number of employees 2014'].isna().sum()

# Display the result
print("Number of entries with NaN in Number of employees 2014:", num_nan_employees)

In [None]:
# Drop rows with NaN in the "Number of employees 2014" column in df_up_merged
df_up_merged.dropna(subset=['Number of employees 2014'], inplace=True)

# Reset the index after dropping rows
df_up_merged.reset_index(drop=True, inplace=True)
df_up_merged.head(2)

Matching was successful in df_up_merged.

In [None]:
# List of values searched
gesuchte_werte = ['Hovnanian', 'Community Health Systems', 'Denbury Inc', 'WESTMORELAND COAL CO',
                  'ICONIX BRAND GROUP INC', 'NORTHERN OIL & GAS INC', 'SEARS HOLDINGS CORP',
                  'PARKER DRILLING CO', 'PG&E CORP', 'CLOUD PEAK ENERGY INC', 'PHI INC',
                  'BRISTOW GROUP INC', 'WEATHERFORD INTL PLC', 'ALTA MESA RESOURCES INC',
                  'CHESAPEAKE ENERGY CORP', 'EP ENERGY CORP', 'RITE AID CORP',
                  'DESTINATION MATERNITY CORP', 'DEAN FOODS CO', 'MALLINCKRODT PLC',
                  'FRONTIER COMMUNIC PARENT INC', 'LSC COMMUNICATIONS INC',
                  'DIAMOND OFFSHRE DRILLING INC', 'ENVISION HEALTHCARE CORP', 'UNIT CORP',
                  'TUESDAY MORNING CORP', 'CSI COMPRESSCO LP', 'FERRELLGAS PARTNERS -LP',
                  'W&T OFFSHORE INC', 'TUPPERWARE BRANDS CORP', 'SEADRILL LTD',
                  'GLOBAL EAGLE ENTERTAINMENT', 'FORUM ENERGY TECH INC', 'TRANSOCEAN LTD',
                  'TOWN SPORTS INTL HOLDINGS', 'SUMMIT MIDSTREAM PARTNERS LP',
                  'GULFPORT ENERGY CORP', 'NABORS INDUSTRIES LTD', 'PACIFIC DRILLING SA',
                  'CALLON PETROLEUM CO/DE']

# The "Company Name" column was stripped of punctuation with clean_company_name
# before the merge, so search values such as 'PG&E CORP' or
# 'CALLON PETROLEUM CO/DE' can only match in their cleaned form. The raw
# spellings are kept as well, so nothing that matched before is lost.
searched_values = gesuchte_werte + [clean_company_name(wert).strip() for wert in gesuchte_werte]

# Check if the values in the column "Company Name" are included
gesuchte_werte_in_df = df_up_merged[df_up_merged['Company Name'].isin(searched_values)]

# Print
gesuchte_werte_in_df


# 3. Data Preparation

During data preparation, the data are first examined in general (3.1). Missing values (3.1.1), duplicates (3.1.2), non-numerical columns (3.1.4) and the distribution of the target variable (3.1.5) are checked. Because of the data types, outliers can only be checked downstream. Therefore, an initial data cleaning is carried out in 3.2: columns that are not needed are removed (3.2.1), the data types are corrected (3.2.2), and the columns country (3.2.3) and industry (3.2.4) are cleaned. On this basis, outliers can be examined in 3.3.1. Subsequently, the content-related data distribution is checked (3.3.2, 3.3.3). The final data cleaning is carried out in chapter 3.4: values that are not to be taken into account are removed (3.4.1, 3.4.2), encoding takes place where necessary (3.4.3), and empty values are treated (3.4.4). Finally, collinearity and multicollinearity are checked (3.5).

## 3.1 Data Inspection

In [None]:
df_up_merged.head(10)

The following columns do not add value in this context and are therefore not needed:
- Adress
- S&P Entity ID
- Excel Company ID
- Index Constituents [Secondary Listings]
- S&P Entity Credit Rating - Issuer Credit Rating - Local Currency LT [Latest] (Rating)
- S&P Entity Credit Rating Date - Issuer Credit Rating - Local Currency LT [Latest] (Rating Date)
- S&P Entity Credit Rating - Issuer Credit Rating - Foreign Currency LT [Latest] (Rating)
- S&P Entity Credit Rating Date - Issuer Credit Rating - Foreign Currency LT [Latest] (Rating Date)
- S&P Entity Credit Rating - Issuer Credit Rating - Local Currency LT [Latest] (CreditWatch)
- S&P Entity Credit Rating Date - Issuer Credit Rating - Foreign Currency LT [Latest] (Rating Date)	
- S&P Entity Credit Rating - Issuer Credit Rating - Local Currency LT [Latest] (CreditWatch)
- S&P Entity Credit Rating Date - Issuer Credit Rating - Local Currency LT [Latest] (CreditWatch Date)
- S&P Entity Credit Rating - Issuer Credit Rating - Local Currency LT [Latest] (Outlook)
- S&P Entity Credit Rating Date - Issuer Credit Rating - Foreign Currency LT [Latest] (Outlook Date)
- the author decided to focus on the timeseries 2014-2018. Therefore the values for 2013 and >2018 can be deleted.

In [None]:
df_up_merged.describe()

In [None]:
df_up_merged.info()

In [None]:
# Report the number of rows, columns and total cells of the data set
print(f'The dataset has {df_up_merged.shape[0]} rows and {df_up_merged.shape[1]} columns. This results in {df_up_merged.size} data entries.')

In [None]:
# Analyze the data types of columns in df_up_merged
column_data_types = df_up_merged.dtypes

# Set the option to display all rows and columns
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Display the result
print("Data Types of Columns in df_up_merged:")
print(column_data_types.to_string())

Apart from the first 11 columns, the other entries are numbers. These must be converted into float values.

### 3.1.1 Checking for missing values

In [None]:
# Set the option to display all columns and rows
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Check for null values in df_up_merged
null_counts = df_up_merged.isnull().sum()

# Display the result
print("Number of null values in each column of df_up_merged:")
print(null_counts)

The following columns contain the highest number of None/NaN fields and need to be cleaned:
- Gross Profit/ Employee 2018
- All Rating and changes in Rating
- Consider dropping companies that have missing values in financials.

Columns are included that are no longer needed and contain some empty values. These are: 
- Rating 2018 ALT (author knows that the column offers no professional added value)
- Rating 2012
- Change 2012/13 (does not concern analysis period)
- New joining work experience 2014 (empty)
- Number of Notices 2018
- Number of notices 2018
- Number of New Joiners 2014 (authors knowledge)

### 3.1.2 Checking for duplicates

In [None]:
duplicates = df_up_merged[df_up_merged.duplicated()]
print("Duplicate Rows : ",len(duplicates))
duplicates

As expected, there are no duplicates in this dataframe. No cleaning necessary.

### 3.1.3 Checking for data outliers

The check for data outliers is done later in this notebook. Most columns first need to be converted to a processable number format.

### 3.1.4 Inspecting non-numerical columns 

In [None]:
df_up_merged['Geographic Region'].head()

In [None]:
unique_region_values = df_up_merged['Geographic Region'].unique()
unique_region_values

In [None]:
df_up_merged['Land'].head()

In [None]:
unique_country_values = df_up_merged['Land'].unique()
unique_country_values

A correlation between Geographic Region and Country is expected. Geographic Region contains less information and should be dropped if needed. The values in country need to be cleaned, since the same names appear in both upper and lower case.

In [None]:
df_up_merged['Exchange'].head()

In [None]:
unique_exchange_values = df_up_merged['Exchange'].unique()
unique_exchange_values

INFO: 
- OM: Nasdaq OMX Nordic, a stock exchange in Sweden, Denmark, Finland, and Iceland;
- SWX: SIX Swiss Exchange, the Swiss stock exchange;
- NYSE: New York Stock Exchange, the stock exchange in New York City, USA;
- ENXTPA: Euronext Paris, the French stock exchange;
- NasdaqGS: Nasdaq Global Select Market, a US-based stock exchange, part of the Nasdaq Stock Market;
- XTRA: Frankfurt Stock Exchange, the stock exchange in Frankfurt, Germany;
- ENXTAM: Euronext Amsterdam, the Dutch stock exchange;
- BME: Bolsas y Mercados Españoles, the stock exchange in Spain;
- LSE: London Stock Exchange, the stock exchange in London, United Kingdom;
- ENXTBR: Euronext Brussels, the stock exchange in Belgium;
- BIT: Borsa Italiana, the stock exchange in Italy;
- ISE: Irish Stock Exchange, the stock exchange in Ireland;
- CPSE: Nasdaq Copenhagen (Copenhagen Stock Exchange), the stock exchange in Denmark;
- WBAG: Wiener Börse AG, the stock exchange in Austria;
- OB: Oslo Børs, the stock exchange in Norway;
- HLSE: Helsinki Stock Exchange, the stock exchange in Finland.

In [None]:
#Counting the number of Tickers. They act as unique identifier per company and should be kept. 
unique_Ticker_count = df_up_merged['Ticker'].nunique()
unique_Ticker_count

No cleaning of column ticker needed from a subject specific point of view.

In [None]:
unique_industry_values = df_up_merged['S&P RatingsDirect® Industry'].unique()
unique_industry_values

Cleaning tasks: "Corporates; Industrials" is a preconfigured prefix and can be dropped. The main industry that follows in the breakdown is the interesting part and needs to be kept. All other information is considered detail and should be dropped. Also rename the column to "Industry".

In [None]:
# Rating columns (2014-2019) whose distinct values are inspected
columns_to_check = [f'Rating {year}' for year in range(2014, 2020)]

for column in columns_to_check:
    # Distinct values of the current rating column
    unique_values = df_up_merged[column].unique()

    # Header line followed by the values themselves
    print(f"Unique values for {column}:")
    print(unique_values)

Variables need to be converted to categorical features to use them in further analysis.

###  3.1.5 Checking for target variable

In [None]:
# Only the target column is inspected here
columns_to_check = ["Default"]

for column in columns_to_check:
    # Distinct values of the target column
    unique_values = df_up_merged[column].unique()

    # Header line and values, each on its own line
    print(f"Unique values for {column}:", unique_values, sep="\n")

In [None]:
# Frequency of each value of the target column, computed once
default_distribution = df_up_merged["Default"].value_counts()
No_default_count = default_distribution[0]
default_count = default_distribution[1]

print("Number of healthy companies:", No_default_count)
print("Number of default dataset:", default_count)

Few default cases are available - a class imbalance that is common in this field.

## 3.2 First data cleansing to enable deeper Data inspection

In [None]:
# Work on a copy so that df_up_merged stays unchanged as a checkpoint
df_up = df_up_merged.copy()

### 3.2.1 Dropping columns that are not needed or empty

In [None]:
# List of columns to remove: unused or empty LinkedIn columns, identifiers,
# S&P rating metadata, all yearly figures for 2013, 2019 and 2020, the helper
# columns of the name matching, and ratings / rating changes after 2019.
# Order and duplicate entries of the list are kept as they were.
_eur_suffix = " (€EURmm, Historical rate)"
_out_of_scope_years = (2013, 2019, 2020)

columns_to_remove = (
    [
        "Rating 2018 ALT",
        "Rating 2012",
        "Change 2012/13",
        "New joining work experience 2014",
        "Migrating work experience 2018",
        "Number of Notices 2018",
        "Number of notices 2018",
        "Number of New Joiners 2014",
        'Adress',
        'S&P Entity ID',
        'Excel Company ID',
        'Index Constituents [Secondary Listings]',
        'Index Constituents [Primary Listing]',
        'S&P Entity Credit Rating - Issuer Credit Rating - Local Currency LT [Latest] (Rating)',
        'S&P Entity Credit Rating Date - Issuer Credit Rating - Local Currency LT [Latest] (Rating Date)',
        'S&P Entity Credit Rating - Issuer Credit Rating - Foreign Currency LT [Latest] (Rating)',
        'S&P Entity Credit Rating Date - Issuer Credit Rating - Foreign Currency LT [Latest] (Rating Date)',
        'S&P Entity Credit Rating - Issuer Credit Rating - Local Currency LT [Latest] (CreditWatch)',
        'S&P Entity Credit Rating Date - Issuer Credit Rating - Foreign Currency LT [Latest] (Rating Date)',
        'S&P Entity Credit Rating - Issuer Credit Rating - Local Currency LT [Latest] (CreditWatch)',
        'S&P Entity Credit Rating Date - Issuer Credit Rating - Local Currency LT [Latest] (CreditWatch Date)',
        'S&P Entity Credit Rating - Issuer Credit Rating - Local Currency LT [Latest] (Outlook)',
        'S&P Entity Credit Rating Date - Issuer Credit Rating - Foreign Currency LT [Latest] (Outlook Date)',
    ]
    # Market capitalization uses a different label for 2013 than for 2019/2020
    + [f"Market Capitalization [12/31/2013]{_eur_suffix}"]
    + [f"Market Capitalization [My Setting] [12/31/{year}]{_eur_suffix}" for year in (2019, 2020)]
    # Capital IQ metrics: one column per metric and out-of-scope year
    + [
        f"{metric} - Capital IQ [CY {year}]{suffix}"
        for metric, suffix in [
            ("EBITDA", _eur_suffix),
            ("EBIT", _eur_suffix),
            ("Net Income", _eur_suffix),
            ("Total Equity", _eur_suffix),
            ("Total Debt", _eur_suffix),
            ("Total Assets", _eur_suffix),
            ("Net Debt", _eur_suffix),
            ("Gross Profit", _eur_suffix),
            ("Total Employees", ""),
            ("Cash from Ops.", _eur_suffix),
            ("Total Revenue", _eur_suffix),
        ]
        for year in _out_of_scope_years
    ]
    + [
        "Copy_Company_Name", "Company Name Up", "Best Match in df_waf_rfm", "Firm_original_name",
        "Rating 2020",
        "Rating 2021",
        "Rating 2022",
        "Rating 2023",
        "Change 2019/20",
        "Change 2020/21",
        "Change 2021/22",
        "Change 2022/23",
    ]
)

# Drop the specified columns from the DataFrame in place and preview the result.
# Every name in columns_to_remove must exist in df_up; with its default
# settings, drop raises a KeyError for a missing label.
df_up.drop(columns=columns_to_remove, inplace=True)
df_up.head(2)

### 3.2.2 Converting columns from object to float. 

First there is a need to check for special characters (spaces, etc.)

In [None]:
def check_for_special_characters(df, columns_to_check):
    """Print every cell of the given columns that contains a special character.

    A special character is anything other than a word character, whitespace
    or a full stop. Each cell is converted to ``str`` before it is tested.
    The function only prints its findings and returns ``None``.
    """
    special_character = re.compile(r'[^\w\s.]')

    # Collect (row label, column, cell) for every offending cell, column by column
    findings = [
        (row_label, column, cell)
        for column in columns_to_check
        for row_label, cell in df[column].items()
        if special_character.search(str(cell))
    ]

    if not findings:
        print("Keine Sonderzeichen in den angegebenen Spalten gefunden.")
        return

    print("Folgende Sonderzeichen wurden gefunden:")
    for row_label, column, cell in findings:
        print(f"Row {row_label}, Column {column}, Value: {cell}")


# Columns scanned for special characters: the Capital IQ financials for
# 2014-2018 followed by the LinkedIn-derived metrics.
_eur_suffix = " (€EURmm, Historical rate)"
_analysis_years = range(2014, 2019)

columns_to_check = (
    # Market capitalization uses a different label from 2017 onwards
    [f"Market Capitalization [12/31/{year}]{_eur_suffix}" for year in (2014, 2015, 2016)]
    + [f"Market Capitalization [My Setting] [12/31/{year}]{_eur_suffix}" for year in (2017, 2018)]
    # Capital IQ metrics: one column per metric and analysis year
    + [
        f"{metric} - Capital IQ [CY {year}]{suffix}"
        for metric, suffix in [
            ("EBITDA", _eur_suffix),
            ("EBIT", _eur_suffix),
            ("Net Income", _eur_suffix),
            ("Total Equity", _eur_suffix),
            ("Total Debt", _eur_suffix),
            ("Total Assets", _eur_suffix),
            ("Net Debt", _eur_suffix),
            ("Gross Profit", _eur_suffix),
            ("Total Employees", ""),
            ("Cash from Ops.", _eur_suffix),
            ("Total Revenue", _eur_suffix),
        ]
        for year in _analysis_years
    ]
    # LinkedIn-derived metrics (not every metric exists for every year)
    + [f"Employee development {year}" for year in range(2015, 2019)]
    + [f"Migrating work experience {year}" for year in range(2014, 2018)]
    + [f"New joining work experience {year}" for year in range(2015, 2019)]
    + [f"Fluctuation rate {year}" for year in range(2014, 2019)]
    + ["More than once/different position"]
)

check_for_special_characters(df_up, columns_to_check)


There are negative numbers, commas and also empty fields indicated by -. These characters need to be cleaned.
Next, it must be checked whether the columns hold whole numbers or decimal numbers.

In [None]:
# Check for integers in columns
def check_for_integers(df, columns_to_check):
    """Return the columns whose values are all whole numbers.

    Each value is converted to ``str`` and must consist of an optional sign
    followed by digits only. Unlike ``str.isdigit()``, this also recognises
    negative integers such as "-5", while a lone "-" (used as a placeholder
    for empty values) still does not count as an integer. Missing values
    (NaN/None) are not integers either, so a column containing them is not
    reported.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to inspect.
    columns_to_check : iterable of str
        Names of the columns to test.

    Returns
    -------
    list of str
        The names of the all-integer columns, in the order given.
    """
    integer_pattern = re.compile(r'[+-]?\d+')

    integer_columns = []
    for column in columns_to_check:
        is_integer = df[column].apply(lambda x: integer_pattern.fullmatch(str(x)) is not None).all()
        if is_integer:
            integer_columns.append(column)
    return integer_columns

# Run the integer check on the columns that were scanned for special characters
# in the previous cell. With an empty list the check inspects nothing and always
# reports that no integer columns were found.
columns_to_convert = list(columns_to_check)
integer_columns = check_for_integers(df_up, columns_to_convert)

if integer_columns:
    print("The following columns contain integers:")
    print(integer_columns)
else:
    print("No columns with only integers were found.")

Columns can be converted to float, since the dataset only contains decimal numbers. Finally, the decimal separator is checked.

In [None]:
# Columns whose decimal separator is checked: the Capital IQ financials for
# 2014-2018 followed by the LinkedIn-derived metrics.
_eur_suffix = " (€EURmm, Historical rate)"
_analysis_years = range(2014, 2019)

columns_to_convert = (
    # Market capitalization uses a different label from 2017 onwards
    [f"Market Capitalization [12/31/{year}]{_eur_suffix}" for year in (2014, 2015, 2016)]
    + [f"Market Capitalization [My Setting] [12/31/{year}]{_eur_suffix}" for year in (2017, 2018)]
    # Capital IQ metrics: one column per metric and analysis year
    + [
        f"{metric} - Capital IQ [CY {year}]{suffix}"
        for metric, suffix in [
            ("EBITDA", _eur_suffix),
            ("EBIT", _eur_suffix),
            ("Net Income", _eur_suffix),
            ("Total Equity", _eur_suffix),
            ("Total Debt", _eur_suffix),
            ("Total Assets", _eur_suffix),
            ("Net Debt", _eur_suffix),
            ("Gross Profit", _eur_suffix),
            ("Total Employees", ""),
            ("Cash from Ops.", _eur_suffix),
            ("Total Revenue", _eur_suffix),
        ]
        for year in _analysis_years
    ]
    # LinkedIn-derived metrics (not every metric exists for every year)
    + [f"Employee development {year}" for year in range(2015, 2019)]
    + [f"Migrating work experience {year}" for year in range(2014, 2018)]
    + [f"New joining work experience {year}" for year in range(2015, 2019)]
    + [f"Fluctuation rate {year}" for year in range(2014, 2019)]
    + ["More than once/different position"]
)

def check_comma_or_dot(df, columns):
    """Group columns by the separator character found in their values.

    Args:
        df: DataFrame holding the columns to inspect.
        columns: Iterable of column labels. Each column is read through the
            pandas ``.str`` accessor, so it has to hold string data.

    Returns:
        A tuple ``(comma_columns, dot_columns)``. A column is listed in
        ``comma_columns`` when at least one value contains a literal ``,``;
        otherwise it is listed in ``dot_columns`` when at least one value
        contains a literal ``.``. Columns containing neither character appear
        in neither list.
    """
    comma_columns = []
    dot_columns = []

    for column in columns:
        text = df[column].str
        # regex=False: the pattern is a regular expression by default, and the
        # regex "." matches any character, so every non-empty column would be
        # reported as a dot column.
        # na=False: missing values count as "no match".
        if text.contains(',', regex=False, na=False).any():
            comma_columns.append(column)
        elif text.contains('.', regex=False, na=False).any():
            dot_columns.append(column)

    return comma_columns, dot_columns

# Group the columns of df_up by the separator character they contain
comma_columns, dot_columns = check_comma_or_dot(df_up, columns_to_convert)

# Print each heading followed by its column list on the next line
print("Spalten mit Komma:", comma_columns, sep="\n")
print("Spalten mit Punkt:", dot_columns, sep="\n")

For the conversion to succeed, a consistent decimal separator must be used. Therefore, commas are replaced by dots.

In [None]:
# Columns in which commas are replaced by dots (one group per feature, then the single count column)
columns_to_convert = [
    *(f"Employee development {year}" for year in range(2015, 2019)),
    *(f"Migrating work experience {year}" for year in range(2014, 2018)),
    *(f"New joining work experience {year}" for year in range(2015, 2019)),
    *(f"Fluctuation rate {year}" for year in range(2014, 2019)),
    "More than once/different position",
]

# Swap every "," for "." in the string values of the listed columns
for column in columns_to_convert:
    df_up[column] = df_up[column].str.replace(',', '.')

# Show the first rows as the cell output
df_up.head()

The "-" entries that stand for an empty value are converted to NaN.

In [None]:
# Some columns use "-" for an empty value. These placeholders have to be
# replaced before the columns can be converted to float.
columns_to_convert = [
    # Year-end market capitalization; the 2017 and 2018 headers carry an extra "[My Setting]" tag
    *(
        f"Market Capitalization [12/31/{year}] (€EURmm, Historical rate)"
        for year in (2014, 2015, 2016)
    ),
    *(
        f"Market Capitalization [My Setting] [12/31/{year}] (€EURmm, Historical rate)"
        for year in (2017, 2018)
    ),
    # Capital IQ metrics with the EUR suffix, calendar years 2014-2018 each
    *(
        f"{metric} - Capital IQ [CY {year}] (€EURmm, Historical rate)"
        for metric in (
            "EBITDA",
            "EBIT",
            "Net Income",
            "Total Equity",
            "Total Debt",
            "Total Assets",
            "Net Debt",
            "Gross Profit",
        )
        for year in range(2014, 2019)
    ),
    # Headcount columns have no currency suffix
    *(f"Total Employees - Capital IQ [CY {year}]" for year in range(2014, 2019)),
    *(
        f"{metric} - Capital IQ [CY {year}] (€EURmm, Historical rate)"
        for metric in ("Cash from Ops.", "Total Revenue")
        for year in range(2014, 2019)
    ),
]

# Cells that consist of a lone "-" are swapped for NaN in every listed column
for column in columns_to_convert:
    df_up[column] = df_up[column].replace(to_replace='-', value=float('nan'))

Lastly the columns can be converted to float.

In [None]:
# Columns that are converted to float: all financial metrics followed by the LinkedIn features
columns_to_convert = [
    # Year-end market capitalization; the 2017 and 2018 headers carry an extra "[My Setting]" tag
    *(
        f"Market Capitalization [12/31/{year}] (€EURmm, Historical rate)"
        for year in (2014, 2015, 2016)
    ),
    *(
        f"Market Capitalization [My Setting] [12/31/{year}] (€EURmm, Historical rate)"
        for year in (2017, 2018)
    ),
    # Capital IQ metrics with the EUR suffix, calendar years 2014-2018 each
    *(
        f"{metric} - Capital IQ [CY {year}] (€EURmm, Historical rate)"
        for metric in (
            "EBITDA",
            "EBIT",
            "Net Income",
            "Total Equity",
            "Total Debt",
            "Total Assets",
            "Net Debt",
            "Gross Profit",
        )
        for year in range(2014, 2019)
    ),
    # Headcount columns have no currency suffix
    *(f"Total Employees - Capital IQ [CY {year}]" for year in range(2014, 2019)),
    *(
        f"{metric} - Capital IQ [CY {year}] (€EURmm, Historical rate)"
        for metric in ("Cash from Ops.", "Total Revenue")
        for year in range(2014, 2019)
    ),
    # LinkedIn features (the year ranges differ per feature)
    *(f"Employee development {year}" for year in range(2015, 2019)),
    *(f"Migrating work experience {year}" for year in range(2014, 2018)),
    *(f"New joining work experience {year}" for year in range(2015, 2019)),
    *(f"Fluctuation rate {year}" for year in range(2014, 2019)),
    "More than once/different position",
]

def convert_to_float_with_negatives(value):
    """Convert ``value`` to ``float`` when possible, otherwise return it unchanged.

    ``float()`` accepts a leading minus sign, so negative numbers such as
    ``"-3.5"`` are converted like any other number.

    Args:
        value: A single cell value, e.g. a string or a number.

    Returns:
        ``value`` as a ``float``, or the original ``value`` when it cannot be
        converted (non-numeric text, or an object such as ``None`` that
        ``float()`` does not accept).
    """
    try:
        return float(value)
    except (TypeError, ValueError):
        # ValueError: text that is not a number.
        # TypeError: objects float() cannot handle at all (e.g. None).
        return value

# Run the element-wise float conversion over every listed column;
# values that cannot be converted are left as they are
for column in columns_to_convert:
    df_up[column] = df_up[column].apply(convert_to_float_with_negatives)

# Show the first rows as the cell output
df_up.head()


Check if converting was successful:

In [None]:
# Lift the pandas display limits so that no row or column gets truncated
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Data type of every column in df_up
column_data_types = df_up.dtypes

# Print the heading, then the full listing on the following lines
print("Data Types of Columns in df_up:", column_data_types.to_string(), sep="\n")

### 3.2.3 Cleaning column country

In [None]:
# Upper-case the country column so that differently cased spellings collapse into one value
df_up["Land"] = df_up["Land"].str.upper()

# Distinct country values after normalisation (shown as the cell output)
unique_land_values = pd.unique(df_up["Land"])
unique_land_values

In [None]:
# Upper-case the country column, then cast it to the string data type.
# NOTE(review): astype(str) may turn missing values into the literal text "nan" - confirm this is intended.
df_up["Land"] = df_up["Land"].str.upper().astype(str)

# Distinct country values after the cast (shown as the cell output)
unique_land_values = pd.unique(df_up["Land"])
unique_land_values


### 3.2.4 Unify values in Industries

In [None]:
# Give the S&P industry column a short name that is easier to reference
df_up = df_up.rename(columns={"S&P RatingsDirect® Industry": "Industry"})
df_up.head(1)

In [None]:
# Reduce each "Industry" entry to a single label:
#   1. drop the literal prefix "Corporates; Industrials;"
#   2. keep only the text before the first remaining semicolon
# NOTE(review): whitespace left over after the removed prefix is not stripped - confirm the labels come out clean.
df_up['Industry'] = (
    df_up['Industry']
    .str.replace('Corporates; Industrials;', '', regex=False)
    .str.split(';')
    .str[0]
)

# Distinct industry labels after the cleanup (shown as the cell output)
unique_industries = df_up['Industry'].unique()
unique_industries

In [None]:
# Continue on an independent copy so that later steps do not modify df_up.
# df_pp = "pre-processed"
df_pp = df_up.copy()

### 3.3.1 Checking for data outliers

Note: Outliers are checked in groups to confirm that there are no obvious errors in the data. Due to the nature of the domain, it is not absolutely necessary to clean the outliers - especially since the source of the financials is Bloomberg, which counts as a reliable source.

In [None]:
# Outlier check for the year-end market capitalization columns (one box per column)
selected_columns = [
    *(
        f"Market Capitalization [12/31/{year}] (€EURmm, Historical rate)"
        for year in (2014, 2015, 2016)
    ),
    # The 2017 and 2018 column headers carry an extra "[My Setting]" tag
    *(
        f"Market Capitalization [My Setting] [12/31/{year}] (€EURmm, Historical rate)"
        for year in (2017, 2018)
    ),
]

plt.figure(figsize=(12, 8))  # 12 x 8 inch canvas

df_pp[selected_columns].boxplot()
plt.xticks(rotation=45)  # tilt the long column names so they stay legible
plt.ylabel("Market Capitalization (€EURmm)")
plt.title("Market Capitalization Boxplots")
plt.show()

In [None]:
# Outlier check for EBITDA (one box per calendar year)
selected_columns = [
    "EBITDA - Capital IQ [CY 2014] (€EURmm, Historical rate)",
    "EBITDA - Capital IQ [CY 2015] (€EURmm, Historical rate)",
    "EBITDA - Capital IQ [CY 2016] (€EURmm, Historical rate)",
    "EBITDA - Capital IQ [CY 2017] (€EURmm, Historical rate)",
    "EBITDA - Capital IQ [CY 2018] (€EURmm, Historical rate)",
]

plt.figure(figsize=(12, 8))  # Adjust the figure size if needed

df_pp[selected_columns].boxplot()
plt.title("EBITDA Boxplots")
# Label the axis with the plotted metric (it previously read "Market Capitalization")
plt.ylabel("EBITDA (€EURmm)")
plt.xticks(rotation=45)  # Rotate x-axis labels for better readability
plt.show()

In [None]:
# Outlier check for EBIT (one box per calendar year)
selected_columns = [
    "EBIT - Capital IQ [CY 2014] (€EURmm, Historical rate)",
    "EBIT - Capital IQ [CY 2015] (€EURmm, Historical rate)",
    "EBIT - Capital IQ [CY 2016] (€EURmm, Historical rate)",
    "EBIT - Capital IQ [CY 2017] (€EURmm, Historical rate)",
    "EBIT - Capital IQ [CY 2018] (€EURmm, Historical rate)",
]

plt.figure(figsize=(12, 8))  # Adjust the figure size if needed

df_pp[selected_columns].boxplot()
plt.title("EBIT Boxplots")
# Label the axis with the plotted metric (it previously read "Market Capitalization")
plt.ylabel("EBIT (€EURmm)")
plt.xticks(rotation=45)  # Rotate x-axis labels for better readability
plt.show()

In [None]:
# Outlier check for net income (one box per calendar year)
selected_columns = [
    "Net Income - Capital IQ [CY 2014] (€EURmm, Historical rate)",
    "Net Income - Capital IQ [CY 2015] (€EURmm, Historical rate)",
    "Net Income - Capital IQ [CY 2016] (€EURmm, Historical rate)",
    "Net Income - Capital IQ [CY 2017] (€EURmm, Historical rate)",
    "Net Income - Capital IQ [CY 2018] (€EURmm, Historical rate)",
]

plt.figure(figsize=(12, 8))  # Adjust the figure size if needed

df_pp[selected_columns].boxplot()
plt.title("Net Income Boxplots")
# Label the axis with the plotted metric (it previously read "Market Capitalization")
plt.ylabel("Net Income (€EURmm)")
plt.xticks(rotation=45)  # Rotate x-axis labels for better readability
plt.show()

In [None]:
# Outlier check for total equity (one box per calendar year)
selected_columns = [
    "Total Equity - Capital IQ [CY 2014] (€EURmm, Historical rate)",
    "Total Equity - Capital IQ [CY 2015] (€EURmm, Historical rate)",
    "Total Equity - Capital IQ [CY 2016] (€EURmm, Historical rate)",
    "Total Equity - Capital IQ [CY 2017] (€EURmm, Historical rate)",
    "Total Equity - Capital IQ [CY 2018] (€EURmm, Historical rate)",
]

plt.figure(figsize=(12, 8))  # Adjust the figure size if needed

df_pp[selected_columns].boxplot()
plt.title("Total Equity Boxplots")
# Label the axis with the plotted metric (it previously read "Market Capitalization")
plt.ylabel("Total Equity (€EURmm)")
plt.xticks(rotation=45)  # Rotate x-axis labels for better readability
plt.show()

In [None]:
# Outlier check for total debt (one box per calendar year)
selected_columns = [
    "Total Debt - Capital IQ [CY 2014] (€EURmm, Historical rate)",
    "Total Debt - Capital IQ [CY 2015] (€EURmm, Historical rate)",
    "Total Debt - Capital IQ [CY 2016] (€EURmm, Historical rate)",
    "Total Debt - Capital IQ [CY 2017] (€EURmm, Historical rate)",
    "Total Debt - Capital IQ [CY 2018] (€EURmm, Historical rate)",
]

plt.figure(figsize=(12, 8))  # Adjust the figure size if needed

df_pp[selected_columns].boxplot()
plt.title("Total Debt Boxplots")
# Label the axis with the plotted metric (it previously read "Market Capitalization")
plt.ylabel("Total Debt (€EURmm)")
plt.xticks(rotation=45)  # Rotate x-axis labels for better readability
plt.show()

In [None]:
# Outlier check for total assets (one box per calendar year)
selected_columns = [
    "Total Assets - Capital IQ [CY 2014] (€EURmm, Historical rate)",
    "Total Assets - Capital IQ [CY 2015] (€EURmm, Historical rate)",
    "Total Assets - Capital IQ [CY 2016] (€EURmm, Historical rate)",
    "Total Assets - Capital IQ [CY 2017] (€EURmm, Historical rate)",
    "Total Assets - Capital IQ [CY 2018] (€EURmm, Historical rate)",
]

plt.figure(figsize=(12, 8))  # Adjust the figure size if needed

df_pp[selected_columns].boxplot()
plt.title("Total Assets Boxplots")
# Label the axis with the plotted metric (it previously read "Market Capitalization")
plt.ylabel("Total Assets (€EURmm)")
plt.xticks(rotation=45)  # Rotate x-axis labels for better readability
plt.show()

In [None]:
# Outlier check for net debt (one box per calendar year)
selected_columns = [
    "Net Debt - Capital IQ [CY 2014] (€EURmm, Historical rate)",
    "Net Debt - Capital IQ [CY 2015] (€EURmm, Historical rate)",
    "Net Debt - Capital IQ [CY 2016] (€EURmm, Historical rate)",
    "Net Debt - Capital IQ [CY 2017] (€EURmm, Historical rate)",
    "Net Debt - Capital IQ [CY 2018] (€EURmm, Historical rate)",
]

plt.figure(figsize=(12, 8))  # Adjust the figure size if needed

df_pp[selected_columns].boxplot()
# Title and axis label name the plotted metric; they previously read
# "Total Debt" and "Market Capitalization" although the columns are Net Debt.
plt.title("Net Debt Boxplots")
plt.ylabel("Net Debt (€EURmm)")
plt.xticks(rotation=45)  # Rotate x-axis labels for better readability
plt.show()

In [None]:
# Outlier check for gross profit (one box per calendar year)
selected_columns = [
    "Gross Profit - Capital IQ [CY 2014] (€EURmm, Historical rate)",
    "Gross Profit - Capital IQ [CY 2015] (€EURmm, Historical rate)",
    "Gross Profit - Capital IQ [CY 2016] (€EURmm, Historical rate)",
    "Gross Profit - Capital IQ [CY 2017] (€EURmm, Historical rate)",
    "Gross Profit - Capital IQ [CY 2018] (€EURmm, Historical rate)",
]

plt.figure(figsize=(12, 8))  # Adjust the figure size if needed

df_pp[selected_columns].boxplot()
plt.title("Gross Profit Boxplots")
# Label the axis with the plotted metric (it previously read "Market Capitalization")
plt.ylabel("Gross Profit (€EURmm)")
plt.xticks(rotation=45)  # Rotate x-axis labels for better readability
plt.show()

In [None]:
# Outlier check for the total employee counts (one box per calendar year)
selected_columns = [
    "Total Employees - Capital IQ [CY 2014]",
    "Total Employees - Capital IQ [CY 2015]",
    "Total Employees - Capital IQ [CY 2016]",
    "Total Employees - Capital IQ [CY 2017]",
    "Total Employees - Capital IQ [CY 2018]",
]

plt.figure(figsize=(12, 8))  # Adjust the figure size if needed

df_pp[selected_columns].boxplot()
plt.title("Total Employees Boxplots")
# These columns carry no currency unit; the label previously read "Market Capitalization (€EURmm)"
plt.ylabel("Total Employees")
plt.xticks(rotation=45)  # Rotate x-axis labels for better readability
plt.show()

In [None]:
# Outlier check for cash from operations (one box per calendar year)
selected_columns = [
    "Cash from Ops. - Capital IQ [CY 2014] (€EURmm, Historical rate)",
    "Cash from Ops. - Capital IQ [CY 2015] (€EURmm, Historical rate)",
    "Cash from Ops. - Capital IQ [CY 2016] (€EURmm, Historical rate)",
    "Cash from Ops. - Capital IQ [CY 2017] (€EURmm, Historical rate)",
    "Cash from Ops. - Capital IQ [CY 2018] (€EURmm, Historical rate)",
]

plt.figure(figsize=(12, 8))  # Adjust the figure size if needed

df_pp[selected_columns].boxplot()
plt.title("Cash from Ops Boxplots")
# Label the axis with the plotted metric (it previously read "Market Capitalization")
plt.ylabel("Cash from Ops. (€EURmm)")
plt.xticks(rotation=45)  # Rotate x-axis labels for better readability
plt.show()

In [None]:
# Outlier check for total revenue (one box per calendar year)
selected_columns = [
    "Total Revenue - Capital IQ [CY 2014] (€EURmm, Historical rate)",
    "Total Revenue - Capital IQ [CY 2015] (€EURmm, Historical rate)",
    "Total Revenue - Capital IQ [CY 2016] (€EURmm, Historical rate)",
    "Total Revenue - Capital IQ [CY 2017] (€EURmm, Historical rate)",
    "Total Revenue - Capital IQ [CY 2018] (€EURmm, Historical rate)",
]

plt.figure(figsize=(12, 8))  # Adjust the figure size if needed

df_pp[selected_columns].boxplot()
plt.title("Total Revenue Boxplots")
# Label the axis with the plotted metric (it previously read "Market Capitalization")
plt.ylabel("Total Revenue (€EURmm)")
plt.xticks(rotation=45)  # Rotate x-axis labels for better readability
plt.show()

In [None]:
# Outlier check for the "Employee development" features (one box per year)
selected_columns = [
    "Employee development 2015",
    "Employee development 2016",
    "Employee development 2017",
    "Employee development 2018",
]

plt.figure(figsize=(12, 8))  # Adjust the figure size if needed

df_pp[selected_columns].boxplot()
plt.title("Employee development Boxplots")
# Label the axis with the plotted feature; the label previously read "Market Capitalization (€EURmm)"
plt.ylabel("Employee development")
plt.xticks(rotation=45)  # Rotate x-axis labels for better readability
plt.show()

In [None]:
# Outlier check for the "Migrating work experience" features (one box per year)
selected_columns = [
    "Migrating work experience 2014",
    "Migrating work experience 2015",
    "Migrating work experience 2016",
    "Migrating work experience 2017",
]

plt.figure(figsize=(12, 8))  # Adjust the figure size if needed

df_pp[selected_columns].boxplot()
plt.title("Migrating work experience Boxplots")
# Label the axis with the plotted feature; the label previously read "Market Capitalization (€EURmm)"
plt.ylabel("Migrating work experience")
plt.xticks(rotation=45)  # Rotate x-axis labels for better readability
plt.show()

In [None]:
# Outlier check for the "New joining work experience" features (one box per year)
selected_columns = [
    "New joining work experience 2015",
    "New joining work experience 2016",
    "New joining work experience 2017",
    "New joining work experience 2018",
]

plt.figure(figsize=(12, 8))  # Adjust the figure size if needed

df_pp[selected_columns].boxplot()
plt.title("New joining work experience Boxplots")
# Label the axis with the plotted feature; the label previously read "Market Capitalization (€EURmm)"
plt.ylabel("New joining work experience")
plt.xticks(rotation=45)  # Rotate x-axis labels for better readability
plt.show()

In [None]:
# Outlier check for the "Fluctuation rate" features (one box per year)
selected_columns = [
    "Fluctuation rate 2014",
    "Fluctuation rate 2015",
    "Fluctuation rate 2016",
    "Fluctuation rate 2017",
    "Fluctuation rate 2018",
]

plt.figure(figsize=(12, 8))  # Adjust the figure size if needed

df_pp[selected_columns].boxplot()
plt.title("Fluctuation rate Boxplots")
# Label the axis with the plotted feature; the label previously read "Market Capitalization (€EURmm)"
plt.ylabel("Fluctuation rate")
plt.xticks(rotation=45)  # Rotate x-axis labels for better readability
plt.show()

From a statistical point of view, I would use at least a 98% quantile. From a professional point of view, most outliers make sense...

In [None]:
# Outlier check for the single "More than once/different position" feature
selected_columns = ["More than once/different position"]

plt.figure(figsize=(12, 8))  # 12 x 8 inch canvas

df_pp[selected_columns].boxplot()
plt.xticks(rotation=45)  # tilt the column name on the x-axis
plt.ylabel("Number of people who worked there in more than one position")
plt.title("Serveral positions Boxplots")
plt.show()

### 3.3.2 Data allocation with respect to the target variable

In [None]:
# Count how many rows of df_pp fall into each value of the target column "Default"
# (the resulting Series is shown as the cell output).
df_pp['Default'].value_counts()

In [None]:
# Frequency of each value of the target column
value_counts = df_pp['Default'].value_counts()

# Slice labels, and the matching counts (0 when a class does not occur at all)
labels = ['1', '0']
values = [value_counts.get(int(label), 0) for label in labels]

# Pull the first slice (class 1) slightly out of the pie
explode = [0.1, 0]

# Draw the pie with percentages shown to two decimals
plt.pie(values, labels=labels, autopct='%1.2f%%', explode=explode)
plt.axis('equal')  # same scale on both axes, so the pie is a circle
# NOTE(review): the title names "Change_2018/19_down grade" while the plotted column is "Default" - confirm the wording.
plt.title('Distribution of Change_2018/19_down grade')
plt.show()


After the split into test and training data, the training data should be oversampled using the SMOTE technique.

### 3.3.3 Checking distribution in dataset

In [None]:
# Number of companies per value of the 'Land' (country) column
country_counts = df_pp['Land'].value_counts()

# Pie chart with one slice per country, labelled with the country value
plt.figure(figsize=(8, 8))
plt.pie(
    country_counts,
    labels=country_counts.index,
    autopct='%1.1f%%',
)
plt.title('Distribution of Companies by Country')
plt.show()

In [None]:
# Number of distinct values in the 'Land' column (nunique() leaves out missing values by default)
unique_land_values = df_pp['Land'].nunique()
print(f"Unique values: {unique_land_values}")

Most companies are from the USA. Second biggest group is GB, followed by Switzerland.

In [None]:
# Number of companies per industry (the variable names still say "country")
country_counts = df_pp['Industry'].value_counts()

# Tabular form: one row per industry with its count
country_distribution = pd.DataFrame(
    {
        'Industry': country_counts.index,
        'Count': country_counts.values,
    }
)

# Share of each industry relative to all rows of df_pp, in percent
total_countries = len(df_pp['Industry'])
country_distribution['Percentage'] = country_distribution['Count'].div(total_countries).mul(100)

# Largest industries first
country_distribution = country_distribution.sort_values(by='Count', ascending=False)

print(country_distribution)

Insurance companies should not be included and need to be removed.

In [None]:
def calculate_percentage(row, year):
    """Return "Number of employees <year>" as a percentage of the Capital IQ total for that year.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) holding the keys
            "Total Employees - Capital IQ [CY <year>]" and
            "Number of employees <year>".
        year: Year used to build both keys.

    Returns:
        ``employees / total_employees * 100``, or ``None`` when the total is
        missing or zero.
    """
    total_key = "Total Employees - Capital IQ [CY %d]" % year
    employees_key = "Number of employees %d" % year
    total_employees = row[total_key]
    employees = row[employees_key]

    # No meaningful ratio without a usable denominator
    if pd.isna(total_employees) or total_employees == 0:
        return None

    return (employees / total_employees) * 100

# Years for which the percentage columns are computed
years = [2014, 2015, 2016, 2017, 2018]

# Add one "Percentage of employees on Linkedin <year>" column per year by
# applying calculate_percentage to every row of df_pp
for year in years:
    col_name = "Percentage of employees on Linkedin %d" % year
    df_pp[col_name] = df_pp.apply(calculate_percentage, axis=1, args=(year,))

# Company name plus the new percentage columns.
# .copy(): columns are added to output_df further down; an explicit copy makes
# it an independent frame, so those assignments do not hit a slice of df_pp
# (which triggers pandas' SettingWithCopyWarning).
percentage_columns = ["Percentage of employees on Linkedin %d" % year for year in years]
output_df = df_pp[["Company Name"] + percentage_columns].copy()
output_df


In [None]:
# Names of the percentage ranges and their edges; the final bin is open-ended (20% or more)
labels = ['<5%', '5-10%', '10-15%', '15-20%', '20%+']
bins = [0, 5, 10, 15, 20, float('inf')]

# Assign each company to a range for 2017; right=False makes every bin
# include its lower edge and exclude its upper edge
output_df['Percentage Range 2017'] = pd.cut(
    output_df['Percentage of employees on Linkedin 2017'],
    bins=bins,
    labels=labels,
    right=False,
)

# Number of companies per range (shown as the cell output)
percentage_counts = output_df['Percentage Range 2017'].value_counts()
percentage_counts

In [None]:
# Names of the percentage ranges and their edges; the final bin is open-ended (20% or more)
labels = ['<5%', '5-10%', '10-15%', '15-20%', '20%+']
bins = [0, 5, 10, 15, 20, float('inf')]

# Assign each company to a range for 2018; right=False makes every bin
# include its lower edge and exclude its upper edge
output_df['Percentage Range 2018'] = pd.cut(
    output_df['Percentage of employees on Linkedin 2018'],
    bins=bins,
    labels=labels,
    right=False,
)

# Number of companies per range (shown as the cell output)
percentage_counts = output_df['Percentage Range 2018'].value_counts()
percentage_counts

In [None]:
# Rows of df_pp whose target column "Default" equals 1
filtered_rows = df_pp[df_pp['Default'] == 1]

# Names of those companies
company_names = filtered_rows['Company Name']

# Matching rows of output_df, selected by company name.
# .copy(): a column is added to result_df further down; an explicit copy makes
# it an independent frame instead of a filtered slice of output_df (assigning
# to such a slice triggers pandas' SettingWithCopyWarning).
result_df = output_df[output_df['Company Name'].isin(company_names)].copy()
result_df

In [None]:
# Names of the percentage ranges and their edges; the final bin is open-ended (20% or more)
labels = ['<5%', '5-10%', '10-15%', '15-20%', '20%+']
bins = [0, 5, 10, 15, 20, float('inf')]

# Assign each row of result_df to a range for 2018; right=False makes every
# bin include its lower edge and exclude its upper edge
result_df['Percentage Range 2018'] = pd.cut(
    result_df['Percentage of employees on Linkedin 2018'],
    bins=bins,
    labels=labels,
    right=False,
)

# Number of rows per range (shown as the cell output)
percentage_counts = result_df['Percentage Range 2018'].value_counts()
percentage_counts

In [None]:
# Remove the helper percentage columns from df_pp again
columns_to_remove = [
    f'Percentage of employees on Linkedin {year}'
    for year in (2018, 2014, 2015, 2016, 2017)
]
df_pp = df_pp.drop(columns=columns_to_remove)
df_pp

Defaulted companies show a rather low percentage of employees on LinkedIn. The distribution does not change over the years.

The majority of the companies have a percentage below 5%. Take into account that there are no employee numbers for around 60-70 companies, which results in 0. The reduction can be explained by the general reduction in the data in 2018. The LinkedIn dataset was probably retrieved during 2018.

## 3.4 Final data cleansing

In [None]:
# Final cleansing works on its own copy, so df_pp itself is not modified by the following steps
df_pp2 = df_pp.copy()

### 3.4.1 Cleaning data outliers

This feature counts the number of people who have worked in different positions in the company. Even though 120,000 might be realistic in bigger firms, it is cleaned here.

In [None]:
# 98% quantile of the "More than once/different position" feature
quantile_98 = df_pp2['More than once/different position'].quantile(0.98)

# Keep only the rows at or below that quantile.
# NOTE(review): rows with a missing value in this column fail the comparison and are dropped as well - confirm this is intended.
df_pp2 = df_pp2.loc[df_pp2['More than once/different position'].le(quantile_98)]

In [None]:
# Same boxplot as before the cleaning, now drawn from df_pp2
selected_columns = ["More than once/different position"]

plt.figure(figsize=(12, 8))  # 12 x 8 inch canvas

df_pp2[selected_columns].boxplot()
plt.xticks(rotation=45)  # tilt the column name on the x-axis
plt.ylabel("Number of people who worked there in more than one position")
plt.title("Serveral positions Boxplots")
plt.show()

### 3.4.2 Removing Industry Insurance

In [None]:
# Drop every row whose industry label is exactly 'Insurance'
df_pp2 = df_pp2.loc[df_pp2['Industry'] != 'Insurance']

In [None]:
# Number of companies per industry in the cleaned frame (the variable names still say "country")
country_counts = df_pp2['Industry'].value_counts()

# Tabular form: one row per industry with its count
country_distribution = pd.DataFrame(
    {
        'Industry': country_counts.index,
        'Count': country_counts.values,
    }
)

# Share of each industry relative to all rows of df_pp2, in percent
total_countries = len(df_pp2['Industry'])
country_distribution['Percentage'] = country_distribution['Count'].div(total_countries).mul(100)

# Largest industries first
country_distribution = country_distribution.sort_values(by='Count', ascending=False)

print(country_distribution)

Removal of Insurance successful.

In [None]:
# Data type of the column before the cast
print(df_pp2['Industry'].dtype)
# Cast the industry labels to the string type
df_pp2['Industry'] = df_pp2['Industry'].astype(str)
# Data type of the column after the cast
print(df_pp2['Industry'].dtype)


Checking effect on target variable:

In [None]:
# Distribution of the target variable after the cleansing steps above.
# df_pp2 is inspected because the outlier and industry filters were applied
# to it; df_pp still holds the unfiltered data.
df_pp2['Default'].value_counts()

### 3.4.3 Converting ratings to categorical values

The ratings need to be converted into categorical variables to be useful in further analysis. Integer encoding can be used. The rating contains a score that is reflected in the ascending values. To simplify and unify the values, an aggregated scale is used:
AAA 1
AA 1
A 2
BBB 3
BB 4
B 5
CCC 6
CC 6
C 6
D 6

In [None]:
# Print the distinct rating symbols of every 'Rating <year>' column.
# The range runs through 2019 so that all columns that are integer-encoded
# afterwards (2013-2019) are inspected, not only 2013-2018.
for year in range(2013, 2020):
    column_name = f'Rating {year}'
    unique_values = df_pp2[column_name].unique()
    print(f'Unique values in {column_name}: {unique_values}')

In [None]:
# Integer encoding of the ratings on the aggregated scale
# (AAA/AA = 1, A = 2, BBB = 3, BB = 4, B = 5, CCC/CC/C/D = 6).
# The previous mapping numbered the symbols alphabetically (e.g. "AA" = 4,
# "B" = 7, "BBB" = 12), so the integers did not follow credit quality.
aggregated_scale = {
    "AAA": 1,
    "AA": 1,
    "A": 2,
    "BBB": 3,
    "BB": 4,
    "B": 5,
    "CCC": 6,
    "CC": 6,
    "C": 6,
    "D": 6,
}

# Notch modifiers ("+" / "-") do not change the aggregated class, so every
# letter grade is registered with and without them.
rating_mapping = {
    f"{grade}{notch}": score
    for grade, score in aggregated_scale.items()
    for notch in ("", "+", "-")
}

# Encode all rating columns; a missing rating becomes 0
for year in range(2013, 2020):
    column_name = f'Rating {year}'
    df_pp2[column_name] = df_pp2[column_name].replace(rating_mapping).fillna(0)

# Display the updated DataFrame
df_pp2


### 3.4.4 Handling empty entries

In [None]:
# Show every column and row when printing large frames/series
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Number of missing values per column of df_pp2
null_counts = df_pp2.isnull().sum()

# The label names the frame that is actually inspected (df_pp2)
print("Number of null values in each column of df_pp2:")
print(null_counts)

The following adjustments are made: 
- Change <year>: Set "no change"
- Financials: median of the column
- Total Employees <year>: use following year or median
- Gross Profit/ Employee 2018: drop

In [None]:
# Columns whose gaps are imputed with the column median: market
# capitalisation, the Capital IQ financials for 2014-2018 and the 2018 ratios.
_unit = "(€EURmm, Historical rate)"
_capital_iq_metrics = [
    "EBITDA",
    "EBIT",
    "Net Income",
    "Total Equity",
    "Total Debt",
    "Total Assets",
    "Net Debt",
    "Gross Profit",
    "Cash from Ops.",
    "Total Revenue",
]

columns_with_missing_values = (
    # 2014-2016 carry the plain label, 2017-2018 the "[My Setting]" label
    [f"Market Capitalization [12/31/{year}] {_unit}" for year in range(2014, 2017)]
    + [f"Market Capitalization [My Setting] [12/31/{year}] {_unit}" for year in range(2017, 2019)]
    # Every financial metric for each calendar year 2014-2018
    + [
        f"{metric} - Capital IQ [CY {year}] {_unit}"
        for metric in _capital_iq_metrics
        for year in range(2014, 2019)
    ]
    # Ratios that exist for 2018 only
    + [
        "Equity ratio 2018",
        "Debt ratio (in Prozent) 2018",
        "Debt-equity ratio 2018",
        "Return on equity 2018",
        "Return on sales 2018",
    ]
)

# Median imputation, one column at a time
for col in columns_with_missing_values:
    column_values = df_pp2[col]

    # A cell counts as missing when it is NaN or an empty string
    is_missing = column_values.isnull() | (column_values == '')

    # The median is taken over the observed cells only and written into the gaps
    df_pp2.loc[is_missing, col] = column_values[~is_missing].median()

Filling the Total Employees columns - either with the value of a following year or with the median.

In [None]:
# Fill gaps in the 2014 headcount from the following years, nearest year first
employees_2014 = 'Total Employees - Capital IQ [CY 2014]'
for donor_year in (2015, 2016, 2017, 2018):
    donor_column = f'Total Employees - Capital IQ [CY {donor_year}]'
    df_pp2[employees_2014] = df_pp2[employees_2014].fillna(df_pp2[donor_column])

# Whatever is still missing receives the median of the 2014 column
median_employees_2014 = df_pp2[employees_2014].median()
df_pp2[employees_2014] = df_pp2[employees_2014].fillna(median_employees_2014)

In [None]:
# Fill gaps in the 2015 headcount from the following years, nearest year first
employees_2015 = 'Total Employees - Capital IQ [CY 2015]'
for donor_year in (2016, 2017, 2018):
    donor_column = f'Total Employees - Capital IQ [CY {donor_year}]'
    df_pp2[employees_2015] = df_pp2[employees_2015].fillna(df_pp2[donor_column])

# Whatever is still missing receives the median of the 2015 column
median_employees_2015 = df_pp2[employees_2015].median()
df_pp2[employees_2015] = df_pp2[employees_2015].fillna(median_employees_2015)


In [None]:
# Fill gaps in the 2016 headcount from 2017 first, then from 2018
employees_2016 = 'Total Employees - Capital IQ [CY 2016]'
for donor_year in (2017, 2018):
    donor_column = f'Total Employees - Capital IQ [CY {donor_year}]'
    df_pp2[employees_2016] = df_pp2[employees_2016].fillna(df_pp2[donor_column])

# Whatever is still missing receives the median of the 2016 column
median_employees_2016 = df_pp2[employees_2016].median()
df_pp2[employees_2016] = df_pp2[employees_2016].fillna(median_employees_2016)


In [None]:
# Fill gaps in the 2017 headcount from the 2018 column
employees_2017 = 'Total Employees - Capital IQ [CY 2017]'
employees_2018_values = df_pp2['Total Employees - Capital IQ [CY 2018]']
df_pp2[employees_2017] = df_pp2[employees_2017].fillna(employees_2018_values)

# Whatever is still missing receives the median of the 2017 column
median_employees_2017 = df_pp2[employees_2017].median()
df_pp2[employees_2017] = df_pp2[employees_2017].fillna(median_employees_2017)


In [None]:
# Fill gaps in the 2018 headcount with the 2017 value, but only when the
# 2018 column holds at least one value.
# The result is assigned back to the column: calling fillna(inplace=True) on
# df_pp2[...] acts on an intermediate object and is not guaranteed to update
# df_pp2 (it has no effect under pandas Copy-on-Write).
employees_2018 = 'Total Employees - Capital IQ [CY 2018]'
if df_pp2[employees_2018].notnull().any():
    df_pp2[employees_2018] = df_pp2[employees_2018].fillna(
        df_pp2['Total Employees - Capital IQ [CY 2017]']
    )

Dropping Gross Profit / Employee 2018:

In [None]:
# Remove the "Gross Profit/ Employee 2018" column from df_pp2 in place
df_pp2.drop(columns=["Gross Profit/ Employee 2018"], inplace=True)

"No change" is set as the value for the missing change indicators:

In [None]:
# Year-over-year rating-change columns, 'Change 2013/14' ... 'Change 2017/18'
columns_to_fill_with_no_change = [
    f'Change {year}/{(year + 1) % 100}' for year in range(2013, 2018)
]

# Missing entries in these columns are labelled "no change"
change_columns = df_pp2[columns_to_fill_with_no_change]
df_pp2[columns_to_fill_with_no_change] = change_columns.fillna("no change")

Check if all missing fields are eliminated:

In [None]:
# Show every column and row when printing large frames/series
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Number of missing values per column of df_pp2 after the imputation steps
null_counts = df_pp2.isnull().sum()

# The label names the frame that is actually inspected (df_pp2)
print("Number of null values in each column of df_pp2:")
print(null_counts)

Cleaning of missing values successful.

In [None]:
# Number of rows whose 'Default' value is 1 (0 if the value never occurs)
default_distribution = df_pp2['Default'].value_counts()
count_of_ones = default_distribution.get(1, 0)
print("Number of defaults", count_of_ones)

### 3.4.5 Label encoding, of the remaining columns with strings

In [None]:
# Continue on a copy so that the encoding steps do not modify df_pp2
df_pp3 = df_pp2.copy()

In [None]:
# Drop columns (not rows) that carry similar information
columns_to_remove = ['Exchange', 'Ticker', 'Geographic Region']
df_pp3.drop(columns=columns_to_remove, inplace=True)

In [None]:
# Treat the '-' placeholder as a missing value everywhere in the frame
df_pp3.replace('-', float('nan'), inplace=True)

# Most frequent industry (the first one if several are tied)
most_frequent_value = df_pp3['Industry'].mode().iloc[0]

# Impute missing industries with the most frequent value.
# The result is assigned back to the column: fillna(inplace=True) on
# df_pp3['Industry'] acts on an intermediate object and is not guaranteed to
# update df_pp3 (it has no effect under pandas Copy-on-Write).
df_pp3['Industry'] = df_pp3['Industry'].fillna(most_frequent_value)

In [None]:
# Initialize the LabelBinarizer
label_binarizer = LabelBinarizer()

# Apply Binary-Encoding to the 'Land' column
binary_encoded_data = label_binarizer.fit_transform(df_pp3['Industry'])
binary_encoded_cols = [f"Industry{val}" for val in label_binarizer.classes_]
binary_encoded_df = pd.DataFrame(binary_encoded_data, columns=binary_encoded_cols)

# Drop the original 'Land' column from df_pp2
df_pp3.drop('Industry', axis=1, inplace=True)

# Concatenate binary_encoded_df with df_pp2
df_pp3 = pd.concat([df_pp3, binary_encoded_df], axis=1)

# Display the updated DataFrame with binary-encoded 'Land' column
df_pp3.head(2)


In [None]:
# Rename the 'Land' column to 'Country'
df_pp3.rename(columns={'Land': 'Country'}, inplace=True)

In [None]:
# Column to be binary encoded
column_to_encode = 'Country'

# Convert the column to strings
df_pp3[column_to_encode] = df_pp3[column_to_encode].astype(str)

# Initialize the LabelBinarizer
label_binarizer = LabelBinarizer()

# Apply Binary-Encoding to the selected column
binary_encoded_data = label_binarizer.fit_transform(df_pp3[column_to_encode])
binary_encoded_df = pd.DataFrame(binary_encoded_data, columns=[f"{column_to_encode}_{val}" for val in label_binarizer.classes_])

# Concatenate the binary-encoded columns to the original DataFrame
df_pp3 = pd.concat([df_pp3, binary_encoded_df], axis=1)

# Drop the original column 'Land' from the DataFrame
df_pp3.drop(column_to_encode, axis=1, inplace=True)

In [None]:
df_pp3.drop("Country_nan", axis=1, inplace=True)

Ideally, the company name should still be identifiable. Since numbers could cause a false correlation, the column is dropped.

In [None]:
# Company name and the year-over-year change columns are removed
columns_to_remove = [
    "Company Name",
    "Change 2013/14",
    "Change 2014/15",
    "Change 2015/16",
    "Change 2016/17",
    "Change 2017/18",
]

df_pp3 = df_pp3.drop(columns=columns_to_remove)

### 3.5 Checking for multicollinearity

In [None]:
# Continue on a copy so that the multicollinearity steps do not modify df_pp3
df_pp4 = df_pp3.copy()

In [None]:
# (rows, columns) of df_pp4
df_pp4.shape

In [None]:
# With this many columns the axis labels would be unreadable, so the
# correlation matrix is drawn as an unlabelled heat map
correlation_matrix = df_pp4.corr()

plt.figure(figsize=(40, 32))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap="coolwarm",
    xticklabels=False,
    yticklabels=False,
)
plt.title("Korrelationsmatrix")
plt.show()

In [None]:
# Pairwise correlations of all columns
correlation_matrix = df_pp4.corr()

# One [feature, feature, correlation] entry per unordered pair, i.e. the
# part of the matrix above the diagonal, read row by row
feature_names = list(correlation_matrix.columns)
correlation_results = [
    [feature_names[row], feature_names[col], correlation_matrix.iloc[row, col]]
    for row in range(len(feature_names))
    for col in range(row + 1, len(feature_names))
]

# Long-format table of the pairs
correlation_df = pd.DataFrame(
    correlation_results,
    columns=['Feature 1', 'Feature 2', 'Korrelationswert'],
)

# Display the table
correlation_df

In [None]:
# Keep the feature pairs whose correlation is strong in either direction.
# The absolute value is used because a strongly negative correlation signals
# multicollinearity just as a strongly positive one does. No upper bound is
# applied: correlation_df holds each pair of distinct features only once, so
# there are no self-correlations to exclude, and a perfectly correlated pair
# is exactly what this check should reveal.
filtered_correlation_df = correlation_df[
    correlation_df['Korrelationswert'].abs() > 0.7
]

# Display the filtered pairs
filtered_correlation_df

In [None]:
# (number of filtered pairs, number of columns)
filtered_correlation_df.shape

In the dataset, there are metrics for multiple years, and it is observed that these metrics exhibit strong correlations among themselves. From a domain perspective, this is understandable and indicates a stable company. It is important to note that the LinkedIn and Finance KPIs do not show a high correlation to each other.

In [None]:
# Keep the pairs in which the target 'Default' is one of the two features
involves_target = (
    (correlation_df['Feature 1'] == 'Default')
    | (correlation_df['Feature 2'] == 'Default')
)
correlation_results_filtered = correlation_df[involves_target]

# Strongest correlation first, by absolute value. sort_values puts undefined
# (NaN) correlations at the end; positional indexing via argsort()[::-1] is
# not reliable when NaN values are present.
correlation_results_filtered = correlation_results_filtered.sort_values(
    by='Korrelationswert',
    key=lambda values: values.abs(),
    ascending=False,
)
correlation_results_filtered

To prevent correlation between the one hot coded target variable, the Change_2018/19_first rating column is removed.

In [None]:
# Number of rows in df_pp4 whose 'Default' value is 1 (0 if it never occurs)
default_distribution = df_pp4['Default'].value_counts()
count_of_ones = default_distribution.get(1, 0)
print("Number of defaults", count_of_ones)

In [None]:
# (rows, columns) of df_pp4
df_pp4.shape

In [None]:
# Strip every character that is not a letter or a digit from the column names
df_pp4.columns = [re.sub(r'[^a-zA-Z0-9]', '', name) for name in df_pp4.columns]

# Show the header only
df_pp4.head(0)


In [None]:
# Columns that are inspected for NaN values
_yearly_financials = [
    'EBITDA',
    'EBIT',
    'NetIncome',
    'TotalEquity',
    'TotalDebt',
    'TotalAssets',
    'NetDebt',
    'GrossProfit',
]
_yearly_cash_and_revenue = ['CashfromOps', 'TotalRevenue']

columns_to_check = (
    [f'Rating{year}' for year in range(2013, 2020)]
    + ['Change201819']
    # 2014-2016 carry the plain label, 2017-2018 the "MySetting" label
    + [f'MarketCapitalization1231{year}EURmmHistoricalrate' for year in range(2014, 2017)]
    + [f'MarketCapitalizationMySetting1231{year}EURmmHistoricalrate' for year in range(2017, 2019)]
    + [
        f'{metric}CapitalIQCY{year}EURmmHistoricalrate'
        for metric in _yearly_financials
        for year in range(2014, 2019)
    ]
    + [f'TotalEmployeesCapitalIQCY{year}' for year in range(2014, 2019)]
    + [
        f'{metric}CapitalIQCY{year}EURmmHistoricalrate'
        for metric in _yearly_cash_and_revenue
        for year in range(2014, 2019)
    ]
    + [
        'Equityratio2018',
        'DebtratioinProzent2018',
        'Debtequityratio2018',
        'Returnonequity2018',
        'Returnonsales2018',
        'Default',
    ]
    + [f'Numberofemployees{year}' for year in range(2014, 2019)]
    + [f'Employeedevelopment{year}' for year in range(2015, 2019)]
    + [f'Numberofnotices{year}' for year in range(2014, 2018)]
    + [f'Migratingworkexperience{year}' for year in range(2014, 2018)]
    + [f'NumberofNewJoiners{year}' for year in range(2015, 2019)]
    + [f'Newjoiningworkexperience{year}' for year in range(2015, 2019)]
    + [f'Fluctuationrate{year}' for year in range(2014, 2019)]
    + ['Averageyearsofservicewiththecompany', 'Morethanoncedifferentposition']
)

# Step 1: find the rows in which every checked column is NaN
all_checked_missing = df_pp4[columns_to_check].isnull().all(axis=1)
rows_with_all_nan = df_pp4[all_checked_missing]

# Step 2: remove those rows from df_pp4 by their index labels
df_pp4 = df_pp4.drop(rows_with_all_nan.index)

In [None]:
# Step 1: collect the rows that still contain at least one NaN
has_any_nan = df_pp4.isnull().any(axis=1)
rows_with_nan = df_pp4[has_any_nan]

# Step 2: replace every remaining NaN in df_pp4 with 0, in place
df_pp4.fillna(value=0, inplace=True)
df_pp4


In [None]:
# Number of rows in df_pp4 whose 'Default' value is 1 (0 if it never occurs)
default_distribution = df_pp4['Default'].value_counts()
count_of_ones = default_distribution.get(1, 0)
print("Number of defaults", count_of_ones)

In [None]:
# First VIF round: the 2014 financial columns, the LinkedIn columns and 'Default'
vif_round_columns = [
    'NetDebtCapitalIQCY2014EURmmHistoricalrate',
    'TotalEmployeesCapitalIQCY2014',
    'EBITDACapitalIQCY2014EURmmHistoricalrate',
    'EBITCapitalIQCY2014EURmmHistoricalrate',
    'NetIncomeCapitalIQCY2014EURmmHistoricalrate',
    'TotalEquityCapitalIQCY2014EURmmHistoricalrate',
    'TotalDebtCapitalIQCY2014EURmmHistoricalrate',
    'TotalAssetsCapitalIQCY2014EURmmHistoricalrate',
    'GrossProfitCapitalIQCY2014EURmmHistoricalrate',
    'CashfromOpsCapitalIQCY2014EURmmHistoricalrate',
    'TotalRevenueCapitalIQCY2014EURmmHistoricalrate',
    'Numberofemployees2014',
    'Numberofnotices2014',
    'Migratingworkexperience2014',
    'NumberofNewJoiners2015',
    'Newjoiningworkexperience2015',
    'Fluctuationrate2014',
    'Averageyearsofservicewiththecompany',
    'Morethanoncedifferentposition',
    'Default',
]
variables = df_pp4[vif_round_columns]

# One variance inflation factor per column of `variables`
feature_matrix = variables.values
vif = pd.DataFrame({
    'VIF': [
        variance_inflation_factor(feature_matrix, position)
        for position in range(feature_matrix.shape[1])
    ],
    'Features': variables.columns,
})

In [None]:
# Display the VIF table of the first round
vif

In [None]:
# Remove the EBITDA, cash-from-operations and employee-count columns (2014-2018)
removed_after_first_round = (
    [f'EBITDACapitalIQCY{year}EURmmHistoricalrate' for year in range(2014, 2019)]
    + [f'CashfromOpsCapitalIQCY{year}EURmmHistoricalrate' for year in range(2014, 2019)]
    + [f'Numberofemployees{year}' for year in range(2014, 2019)]
)
df_pp4 = df_pp4.drop(columns=removed_after_first_round)

In [None]:
# Second VIF round on a reduced column set
vif_round_columns = [
    'EBITCapitalIQCY2014EURmmHistoricalrate',
    'NetIncomeCapitalIQCY2014EURmmHistoricalrate',
    'TotalEquityCapitalIQCY2014EURmmHistoricalrate',
    'TotalDebtCapitalIQCY2014EURmmHistoricalrate',
    'TotalAssetsCapitalIQCY2014EURmmHistoricalrate',
    'GrossProfitCapitalIQCY2014EURmmHistoricalrate',
    'TotalRevenueCapitalIQCY2014EURmmHistoricalrate',
    'Numberofnotices2014',
    'Migratingworkexperience2014',
    'NumberofNewJoiners2015',
    'Newjoiningworkexperience2015',
    'Fluctuationrate2014',
    'Averageyearsofservicewiththecompany',
    'Morethanoncedifferentposition',
    'Default',
]
variables = df_pp4[vif_round_columns]

# One variance inflation factor per column of `variables`
feature_matrix = variables.values
vif = pd.DataFrame({
    'VIF': [
        variance_inflation_factor(feature_matrix, position)
        for position in range(feature_matrix.shape[1])
    ],
    'Features': variables.columns,
})

In [None]:
# Display the VIF table of the second round
vif

In [None]:
# Remove the total-assets columns for 2014-2018
total_assets_columns = [
    f'TotalAssetsCapitalIQCY{year}EURmmHistoricalrate' for year in range(2014, 2019)
]
df_pp4 = df_pp4.drop(columns=total_assets_columns)

In [None]:
# Third VIF round, without the total-assets column
vif_round_columns = [
    'EBITCapitalIQCY2014EURmmHistoricalrate',
    'NetIncomeCapitalIQCY2014EURmmHistoricalrate',
    'TotalEquityCapitalIQCY2014EURmmHistoricalrate',
    'TotalDebtCapitalIQCY2014EURmmHistoricalrate',
    'GrossProfitCapitalIQCY2014EURmmHistoricalrate',
    'TotalRevenueCapitalIQCY2014EURmmHistoricalrate',
    'Numberofnotices2014',
    'Migratingworkexperience2014',
    'NumberofNewJoiners2015',
    'Newjoiningworkexperience2015',
    'Fluctuationrate2014',
    'Averageyearsofservicewiththecompany',
    'Morethanoncedifferentposition',
    'Default',
]
variables = df_pp4[vif_round_columns]

# One variance inflation factor per column of `variables`
feature_matrix = variables.values
vif = pd.DataFrame({
    'VIF': [
        variance_inflation_factor(feature_matrix, position)
        for position in range(feature_matrix.shape[1])
    ],
    'Features': variables.columns,
})

In [None]:
# Display the VIF table of the third round
vif

In [None]:
# Remove the EBIT columns for 2014-2018
ebit_columns = [
    f'EBITCapitalIQCY{year}EURmmHistoricalrate' for year in range(2014, 2019)
]
df_pp4 = df_pp4.drop(columns=ebit_columns)

In [None]:
# Fourth VIF round, without the EBIT column
vif_round_columns = [
    'NetIncomeCapitalIQCY2014EURmmHistoricalrate',
    'TotalEquityCapitalIQCY2014EURmmHistoricalrate',
    'TotalDebtCapitalIQCY2014EURmmHistoricalrate',
    'GrossProfitCapitalIQCY2014EURmmHistoricalrate',
    'TotalRevenueCapitalIQCY2014EURmmHistoricalrate',
    'Numberofnotices2014',
    'Migratingworkexperience2014',
    'NumberofNewJoiners2015',
    'Newjoiningworkexperience2015',
    'Fluctuationrate2014',
    'Averageyearsofservicewiththecompany',
    'Morethanoncedifferentposition',
    'Default',
]
variables = df_pp4[vif_round_columns]

# One variance inflation factor per column of `variables`
feature_matrix = variables.values
vif = pd.DataFrame({
    'VIF': [
        variance_inflation_factor(feature_matrix, position)
        for position in range(feature_matrix.shape[1])
    ],
    'Features': variables.columns,
})

In [None]:
# Display the VIF table of the fourth round
vif

The remaining correlation, although rather high, is accepted.

# 4 Modeling

In the following, the required data frames are first formed on the basis of the cleaned data (4.0). Then the ML models are first applied to the financial ratios (4.1), then to the LinkedIn features (4.2) and finally to the combined data (4.3).

In [None]:
# Modeling works on a copy of the cleaned data
dfm = df_pp4.copy()

In [None]:
# (rows, columns) of dfm
dfm.shape

## 4.0 Building dataframes

Create the required data frames:
- df_financials
- df_linkedin
- df_com

In [None]:
# getting column names
# column_names = ["{}".format(col) for col in dfm.columns]
# print(column_names)

In [None]:
# Financial feature set (ratings cannot be used because defaulted companies
# have no rating): five metrics for each year 2014-2017, plus the target
_financial_metrics = ['NetIncome', 'TotalEquity', 'TotalDebt', 'GrossProfit', 'TotalRevenue']
selected_columns = [
    f'{metric}CapitalIQCY{year}EURmmHistoricalrate'
    for metric in _financial_metrics
    for year in range(2014, 2018)
] + ['Default']

# New DataFrame restricted to these columns
df_financials = dfm[selected_columns]
df_financials.head(4)

In [None]:
# Output location of the financial-only dataset. The path is absolute and
# machine-specific; replace it with your own target path before running.
file_path = 'C:\\Users\\chiar\\OneDrive\\Masterthesis\\Modell\\Dataframes\\Default\\Financials\\default_financials.csv'

# Write df_financials to CSV without the index column
df_financials.to_csv(file_path, index=False)


In [None]:
# LinkedIn feature set: yearly metrics up to 2017, two aggregate metrics
# and the target
selected_columns = (
    [f'Employeedevelopment{year}' for year in range(2015, 2018)]
    + [f'Numberofnotices{year}' for year in range(2014, 2018)]
    + [f'Migratingworkexperience{year}' for year in range(2014, 2018)]
    + [f'NumberofNewJoiners{year}' for year in range(2015, 2018)]
    + [f'Newjoiningworkexperience{year}' for year in range(2015, 2018)]
    + [f'Fluctuationrate{year}' for year in range(2014, 2018)]
    + ['Averageyearsofservicewiththecompany', 'Morethanoncedifferentposition', 'Default']
)

# New DataFrame restricted to these columns
df_linkedin = dfm[selected_columns]
df_linkedin.head(4)

In [None]:
# Output location of the LinkedIn-only dataset. The path is absolute and
# machine-specific; replace it with your own target path before running.
file_path = 'C:\\Users\\chiar\\OneDrive\\Masterthesis\\Modell\\Dataframes\\Default\\LinkedIn\\default_LinkedIn.csv'

# Write df_linkedin to CSV without the index column
df_linkedin.to_csv(file_path, index=False)

In [None]:
# Columns for combined: the LinkedIn features followed by the financial
# features, then the target. The feature lists are derived from df_linkedin
# and df_financials so that the combined frame is exactly the union of the two
# single-source feature sets. The earlier hard-coded list additionally pulled
# in GrossProfit/TotalRevenue for CY2018, which are in neither single-source
# frame and would confound the financials-vs-LinkedIn-vs-combined comparison.
linkedin_features = [col for col in df_linkedin.columns if col != 'Default']
financial_features = [col for col in df_financials.columns if col != 'Default']
selected_columns = linkedin_features + financial_features + ['Default']

# Create a new dataframe with the combined feature set and preview it
df_com = dfm[selected_columns]
df_com.head(4)

In [None]:
# (rows, columns) of the combined frame
df_com.shape

In [None]:
# Machine-specific absolute output path.
# Replace 'C:\\Users\\chiar\\OneDrive\\Masterthesis\\Modell\\Dataframes\\Default' with your desired path
# NOTE(review): the file is named 'downgrade_combined.csv' although it is written
# to the Default folder and the sibling files are called 'default_*.csv' —
# looks like a copy-paste leftover; confirm before renaming (other code may read it).
file_path = 'C:\\Users\\chiar\\OneDrive\\Masterthesis\\Modell\\Dataframes\\Default\\Com\\downgrade_combined.csv'

# Save the DataFrame to a CSV file (index=False: the row index is not written)
df_com.to_csv(file_path, index=False)

## 4.1 Running models on df_financials

### 4.1.1 Splitting between train and test data

In [None]:
# Feature matrix: every column of the financial frame except the target.
predictors = df_financials.drop(columns=['Default'])

# Target vector: the 'Default' column that the models predict.
target = df_financials['Default']

In [None]:
# Preview the first rows of the feature matrix
predictors.head()

In [None]:
# 80/20 split of the financial data with a fixed seed:
#   X_train_pre / y_train_pre - features / target of the training part (80%),
#                               before any oversampling
#   X_test / y_test           - features / target of the held-out part (20%)
# NOTE(review): the split is not stratified on the target — confirm this is intended
# for the unbalanced 'Default' class.
X_train_pre, X_test, y_train_pre, y_test = train_test_split(
    predictors,
    target,
    test_size=0.2,
    random_state=356,
)

In [None]:
# (rows, columns) of the training features before oversampling
X_train_pre.shape

In [None]:
# Number of training targets before oversampling
y_train_pre.shape

### 4.1.2 Generating synthetic data of train data

In [None]:
# Create an instance of the ADASYN class (fixed seed)
adasyn = ADASYN(random_state=42)

# Apply ADASYN to the training split only, to generate synthetic data
X_train_adasyn, y_train_adasyn = adasyn.fit_resample(X_train_pre, y_train_pre)

# Wrap the resampled data in a DataFrame / Series again, keeping the original
# column names and target name
X_train = pd.DataFrame(X_train_adasyn, columns=X_train_pre.columns)
y_train = pd.Series(y_train_adasyn, name=y_train_pre.name)


In [None]:
# Class counts of the training target before and after oversampling.
# The labels name ADASYN because that is the sampler applied to the training
# split (the previous 'SMOTE' labels were misleading).
temp1 = pd.DataFrame(y_train_pre)
temp2 = pd.DataFrame(y_train)

print('Before ADASYN')
print(temp1['Default'].value_counts())
print('After ADASYN')
print(temp2['Default'].value_counts())

In [None]:
# Class counts of the target in the test split (which was not resampled).
temp3 = pd.DataFrame(y_test)
test_class_counts = temp3['Default'].value_counts()

print('Check for test data')
print(test_class_counts)

In [None]:
# Class counts of the target in the test split (repeats the check from the previous cell)
temp3 = pd.DataFrame(y_test)

print('Check for test data')
print(temp3['Default'].value_counts())

# Create a pie chart of the class shares in the test data
plt.figure(figsize=(6, 6))
temp3['Default'].value_counts().plot(kind='pie', autopct='%1.1f%%', startangle=140, colors=['skyblue', 'lightcoral'])
plt.title('Distribution of Default in Test Data')
plt.ylabel('')  # Remove the y-axis label
plt.show()

Oversampling of the training data was successful. The test data is still unbalanced.

In [None]:
# Persist the financial train/test splits as CSV files (machine-specific absolute paths).
# The training files written here are the oversampled X_train / y_train.
X_test.to_csv(r'C:\Users\chiar\OneDrive\Masterthesis\Modell\Dataframes\Default\Financials\x_test_financials.csv', index=False)
X_train.to_csv(r'C:\Users\chiar\OneDrive\Masterthesis\Modell\Dataframes\Default\Financials\x_train_financials.csv', index=False)
y_test.to_csv(r'C:\Users\chiar\OneDrive\Masterthesis\Modell\Dataframes\Default\Financials\y_test_financials.csv', index=False)
y_train.to_csv(r'C:\Users\chiar\OneDrive\Masterthesis\Modell\Dataframes\Default\Financials\y_train_financials.csv', index=False)

### 4.1.4  Logistic Regression

In [None]:
# Install the warning filter BEFORE fitting: placed after fit(), it could not
# suppress anything raised while the model is trained.
# NOTE(review): this silences every warning category for the rest of the
# session, including solver convergence warnings — confirm that is wanted.
warnings.filterwarnings("ignore")

# Logistic regression with a fixed seed, fitted on the oversampled training data
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train, y_train)

In [None]:
# Predicted class labels for the held-out test features
y_pred = classifier.predict(X_test)

In [None]:
# Compare the true test labels with the predicted labels
cm = confusion_matrix(y_test, y_pred)
 
print ("Confusion Matrix : \n", cm)

In [None]:
# Visualization: confusion matrix of the logistic regression on the test set
# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2;
# ConfusionMatrixDisplay.from_estimator is the documented replacement — confirm
# which scikit-learn version this notebook is pinned to.
plot_confusion_matrix(classifier,X_test,y_test,cmap='Blues')
plt.grid(False)

In [None]:
# Unpack the four cells of the binary confusion matrix `cm`.
tn, fp, fn, tp = cm.ravel()

# Guard the denominators: with no actual positives (recall) or no predicted
# positives (precision) the plain division yields NaN plus a RuntimeWarning;
# report 0.0 in that case instead.
recall = tp / (fn + tp) if (fn + tp) else 0.0
precision = tp / (tp + fp) if (tp + fp) else 0.0

print("True Negatives: " + str(tn))
print("False Positives: " + str(fp))
print("False Negatives: " + str(fn))
print("True Positives: " + str(tp))
print("Recall: " + str(recall))
print("Precision: " + str(precision))

In [None]:
# 5-fold cross-validation of the logistic regression on the training data,
# using the estimator's default score (reported below as accuracy).
# NOTE(review): X_train already contains ADASYN-generated rows, so the
# validation folds include synthetic samples and the scores are probably
# optimistic — consider resampling inside each fold (e.g. an imblearn Pipeline).
cross_val_scores = cross_val_score(estimator=classifier, X=X_train, y=y_train, cv=5)

# Per-fold scores and their mean
print("Cross-validation ACC scores:", cross_val_scores)
print("Average Accuracy:", cross_val_scores.mean())

In [None]:
# Scorer object that evaluates recall inside cross_val_score
recall_scorer = make_scorer(recall_score)

# 5-fold cross-validated recall of the logistic regression on the training data
cross_val_scores_recall = cross_val_score(
    estimator=classifier,
    X=X_train,
    y=y_train,
    scoring=recall_scorer,
    cv=5,
)

# Per-fold recall and its mean
print("Cross-validation Recall scores:", cross_val_scores_recall)
print("Average Cross-validation Recall score:", cross_val_scores_recall.mean())

In [None]:
# Accuracy of the logistic regression predictions on the test set
from sklearn.metrics import accuracy_score
acc = accuracy_score(y_test, y_pred)
print ("Accuracy : ", acc)

In [None]:
# Per-class precision / recall / f1 on the test set
print(classification_report(y_test,y_pred))

In [None]:
# Column 1 of predict_proba is taken as the probability of the positive class (class 1)
y_pred_prob = classifier.predict_proba(X_test)[:, 1]

# ROC AUC of those probabilities against the true test labels
auc_score = roc_auc_score(y_true=y_test, y_score=y_pred_prob)
print("AUC Score:", auc_score)

For verification, we check how well the model reproduces the target variable of the training data: y_train is predicted with the fitted logistic regression from the training features (X_train). Comparing these results with the test results helps to reveal overfitting.

In [None]:
# Predicted class labels for the (oversampled) training features
y_train_pred = classifier.predict(X_train)

In [None]:
# Comparison and results check: per-class metrics on the training data
print(classification_report(y_train,y_train_pred))

In [None]:
# Test-set results per model on the financial data.
# Value order: precision, recall, then acc, then auc.
# (Hard-coded numbers — presumably copied from the cell outputs above; verify after re-running.)
Precisions_financials = {"Logistic Regression": [0.43, 0.38, 0.88, 0.88]}

In [None]:
# Cross-validation results per model on the financial data.
# Value order: recall, acc.
# (Hard-coded numbers — presumably copied from the cell outputs above; verify after re-running.)
Robustness_financials = {"Logistic Regression": [0.85, 0.88]}

### 4.1.5 Decision Tree

In [None]:
# Decision tree with default hyperparameters, fitted on the oversampled training data
# NOTE(review): no random_state is set, so the fitted tree may differ between runs.
tree = DecisionTreeClassifier()
tree.fit(X_train, y_train)

In [None]:
# Predicted class labels of the decision tree for the test features
y_pred_tree = tree.predict(X_test)

In [None]:
# Confusion matrix of the decision tree on the test set
plot_confusion_matrix(tree,X_test,y_test,cmap='Blues')
plt.grid(False)

In [None]:
# Accuracy of the decision tree predictions on the test set
from sklearn.metrics import accuracy_score
acc = accuracy_score(y_test, y_pred_tree)
print ("Accuracy : ", acc)

In [None]:
# Perform 5-fold cross-validation of the decision tree on the training data
cross_val_scores = cross_val_score(tree, X_train, y_train, cv=5)

# Print the results of the cross-validation
print("Cross-validation scores:", cross_val_scores)
print("Average Cross-validation score:", cross_val_scores.mean())

In [None]:
# Per-class precision / recall / f1 of the decision tree on the test set
print(classification_report(y_test, y_pred_tree))

In [None]:
# Column 1 of predict_proba is taken as the probability of the positive class (class 1)
y_pred_prob_tree = tree.predict_proba(X_test)[:, 1]

# ROC AUC of the decision tree on the test set
auc_score_tree = roc_auc_score(y_true=y_test, y_score=y_pred_prob_tree)
print("AUC Score for Decision Tree:", auc_score_tree)

In [None]:
# Predicted class labels of the decision tree for the training features
y_train_pred_tree = tree.predict(X_train)

In [None]:
# Confusion matrix of the decision tree on the training data
plot_confusion_matrix(tree,X_train,y_train, cmap='Blues')
plt.grid(False)

In [None]:
# Per-class precision / recall / f1 of the decision tree on the training data
print(classification_report(y_train, y_train_pred_tree))

In [None]:
# 5-fold cross-validation of the decision tree on the training data, using the
# estimator's default score (reported below as accuracy).
cross_val_scores = cross_val_score(estimator=tree, X=X_train, y=y_train, cv=5)

# Per-fold scores and their mean
print("Cross-validation ACC scores:", cross_val_scores)
print("Average Accuracy:", cross_val_scores.mean())

In [None]:
# Scorer object that evaluates recall inside cross_val_score
recall_scorer = make_scorer(recall_score)

# 5-fold cross-validated recall of the decision tree on the training data
cross_val_scores_recall = cross_val_score(
    estimator=tree,
    X=X_train,
    y=y_train,
    scoring=recall_scorer,
    cv=5,
)

# Per-fold recall and its mean
print("Cross-validation Recall scores:", cross_val_scores_recall)
print("Average Cross-validation Recall score:", cross_val_scores_recall.mean())

In [None]:
# Add the decision tree's test-set results.
# Value order: precision, recall, acc, auc
Precisions_financials["Decision Tree"] = [0.44, 0.88, 0.87, 0.87]

In [None]:
# Add the decision tree's cross-validation results (hard-coded values)
Robustness_financials["Decision Tree"]= [0.89, 0.90]
# recall, acc

### 4.1.6 Random Forest

In [None]:
# Fit one random forest per maximum tree depth and print its train and test reports.
# After the loop, `rf` refers to the forest fitted with the LAST depth in the
# list; the following cells evaluate that model.
# NOTE(review): no random_state is set, so results may differ between runs.
tree_depth = [5, 10, 20, 30]
for i in tree_depth:
    rf = RandomForestClassifier(max_depth=i)
    rf.fit(X_train, y_train)
    print('Max tree depth: ', i)
    print('Train results: ', classification_report(y_train, rf.predict(X_train)))
    print('Test results: ',classification_report(y_test, rf.predict(X_test)))

In [None]:
# Feature importances of the current `rf`, labelled by column name, largest first
feature_scores = pd.Series(rf.feature_importances_, index=X_train.columns).sort_values(ascending=False)
feature_scores

In [None]:
# Column 1 of predict_proba is taken as the probability of the positive class (class 1)
y_pred_prob_rf = rf.predict_proba(X_test)[:, 1]

# ROC AUC of the random forest on the test set
auc_score_rf = roc_auc_score(y_true=y_test, y_score=y_pred_prob_rf)
print('AUC Score for Random Forest:', auc_score_rf)

In [None]:
# 5-fold cross-validation of the random forest on the training data, using the
# estimator's default score (reported below as accuracy).
cross_val_scores = cross_val_score(estimator=rf, X=X_train, y=y_train, cv=5)

# Per-fold scores and their mean
print("Cross-validation ACC scores:", cross_val_scores)
print("Average Accuracy:", cross_val_scores.mean())

In [None]:
# Scorer object that evaluates recall inside cross_val_score
recall_scorer = make_scorer(recall_score)

# 5-fold cross-validated recall of the random forest on the training data
cross_val_scores_recall = cross_val_score(
    estimator=rf,
    X=X_train,
    y=y_train,
    scoring=recall_scorer,
    cv=5,
)

# Per-fold recall and its mean
print("Cross-validation Recall scores:", cross_val_scores_recall)
print("Average Cross-validation Recall score:", cross_val_scores_recall.mean())

In [None]:
# Add the random forest's test-set results (hard-coded values, same order as the other entries)
Precisions_financials["Random Forest"] = [0.50, 0.88, 0.89, 0.94]

In [None]:
# Add the random forest's cross-validation results (hard-coded values)
Robustness_financials["Random Forest"]=[0.91, 0.94]
# recall, acc

### 4.1.7 XGBoost 

In [None]:
# XGBoost classifier with default hyperparameters; it is cross-validated and
# then fitted in the following cells.
# The 5-fold cross-validation that used to run here was dropped: its result was
# never read, and the next cell recomputes the same accuracy scores.
xgb_classifier = xgb.XGBClassifier()

In [None]:
# 5-fold cross-validation of the XGBoost classifier on the training data for
# three metrics: default score (accuracy), ROC AUC and recall.
cross_val_scores_acc = cross_val_score(xgb_classifier, X_train, y_train, cv=5)
cross_val_scores_auc = cross_val_score(xgb_classifier, X_train, y_train, cv=5, scoring='roc_auc')
cross_val_scores_recall = cross_val_score(xgb_classifier, X_train, y_train, cv=5, scoring='recall')

# Report per-fold scores and their mean, metric by metric
for label, scores in (
    ("ACC", cross_val_scores_acc),
    ("AUC", cross_val_scores_auc),
    ("Recall", cross_val_scores_recall),
):
    print(f"Cross-validation {label} scores:", scores)
    print(f"Average Cross-validation {label} score:", scores.mean())


In [None]:
# Fit the XGBoost classifier to the (oversampled) training data
xgb_classifier.fit(X_train, y_train)

# Plot the feature importance of the fitted model
importance = plot_importance(xgb_classifier, height=0.9)
plt.show()

In [None]:
# Print classification reports of the fitted XGBoost model for train and test sets
print('Train results: ', classification_report(y_train, xgb_classifier.predict(X_train)))
print('Test results: ', classification_report(y_test, xgb_classifier.predict(X_test)))

# Calculate the predicted probabilities for the positive class (class 1);
# they are used for the AUC in the next cell
y_pred_prob_xgb = xgb_classifier.predict_proba(X_test)[:, 1]

In [None]:
# Calculate the AUC score for the XGBoost classifier on the test set
auc_score_xgb = roc_auc_score(y_test, y_pred_prob_xgb)
print('AUC Score for XGBoost:', auc_score_xgb)

In [None]:
# Add XGBoost's test-set results (hard-coded values, same order as the other entries)
Precisions_financials["XGBoost"] = [0.47, 0.88, 0.88, 0.94]

In [None]:
# Add XGBoost's cross-validation results (hard-coded values)
Robustness_financials["XGBoost"]= [0.92, 0.93]
# recall, acc

## 4.2 Running models on df_linkedin

### 4.2.1 Splitting between train and test data

In [None]:
# Feature matrix: every column of the LinkedIn frame except the target.
predictors = df_linkedin.drop(columns=['Default'])

# Target vector: the 'Default' column that the models predict.
target = df_linkedin['Default']

In [None]:
# Preview the first rows of the LinkedIn feature matrix
predictors.head()

In [None]:
# 80/20 split of the LinkedIn data with the same seed as the financial split:
#   X_trainL_pre / y_trainL_pre - features / target of the training part (80%),
#                                 before any oversampling
#   X_testL / y_testL           - features / target of the held-out part (20%)
X_trainL_pre, X_testL, y_trainL_pre, y_testL = train_test_split(
    predictors,
    target,
    test_size=0.2,
    random_state=356,
)

### 4.2.2 Generating synthetic data of train data

In [None]:
# Create an instance of the ADASYN class (fixed seed)
adasyn = ADASYN(random_state=42)

# Apply ADASYN to the training split only, to generate synthetic data
X_trainL_adasyn, y_trainL_adasyn = adasyn.fit_resample(X_trainL_pre, y_trainL_pre)

# Wrap the resampled data in a DataFrame / Series again, keeping the original
# column names and target name
X_trainL = pd.DataFrame(X_trainL_adasyn, columns=X_trainL_pre.columns)
y_trainL = pd.Series(y_trainL_adasyn, name=y_trainL_pre.name)

In [None]:
# Class counts of the LinkedIn training target before and after oversampling.
# The labels name ADASYN because that is the sampler applied to the training
# split (the previous 'SMOTE' labels were misleading).
temp1 = pd.DataFrame(y_trainL_pre)
temp2 = pd.DataFrame(y_trainL)

print('Before ADASYN')
print(temp1['Default'].value_counts())
print('After ADASYN')
print(temp2['Default'].value_counts())

In [None]:
# Class counts of the target in the LinkedIn test split (which was not resampled).
temp3 = pd.DataFrame(y_testL)
test_class_counts = temp3['Default'].value_counts()

print('Check for test data')
print(test_class_counts)

Oversampling successful.

In [None]:
# Persist the LinkedIn train/test splits as CSV files (machine-specific absolute paths).
# NOTE(review): the training files written here are the PRE-oversampling
# X_trainL_pre / y_trainL_pre, whereas the financial section saves the oversampled
# X_train / y_train — confirm which variant is intended.
X_testL.to_csv(r'C:\Users\chiar\OneDrive\Masterthesis\Modell\Dataframes\Default\LinkedIn\x_test_linkedin.csv', index=False)
X_trainL_pre.to_csv(r'C:\Users\chiar\OneDrive\Masterthesis\Modell\Dataframes\Default\LinkedIn\x_train_linkedin.csv', index=False)
y_testL.to_csv(r'C:\Users\chiar\OneDrive\Masterthesis\Modell\Dataframes\Default\LinkedIn\y_test_linkedin.csv', index=False)
y_trainL_pre.to_csv(r'C:\Users\chiar\OneDrive\Masterthesis\Modell\Dataframes\Default\LinkedIn\y_train_linkedin.csv', index=False)

### 4.2.3  Logistic Regression

In [None]:
# Install the warning filter BEFORE fitting: placed after fit(), it could not
# suppress anything raised while the model is trained.
# NOTE(review): this silences every warning category for the rest of the
# session, including solver convergence warnings — confirm that is wanted.
warnings.filterwarnings("ignore")

# Logistic regression with a fixed seed, fitted on the oversampled LinkedIn training data
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_trainL, y_trainL)

In [None]:
# Predicted class labels for the held-out LinkedIn test features
y_pred = classifier.predict(X_testL)

In [None]:
# Compare the true test labels with the predicted labels
cm = confusion_matrix(y_testL, y_pred)
 
print ("Confusion Matrix : \n", cm)

In [None]:
# Visualization: confusion matrix of the logistic regression on the LinkedIn test set
plot_confusion_matrix(classifier,X_testL,y_testL,cmap='Blues')
plt.grid(False)

In [None]:
# Unpack the four cells of the binary confusion matrix `cm`.
tn, fp, fn, tp = cm.ravel()

# Guard the denominators: with no actual positives (recall) or no predicted
# positives (precision) the plain division yields NaN plus a RuntimeWarning;
# report 0.0 in that case instead.
recall = tp / (fn + tp) if (fn + tp) else 0.0
precision = tp / (tp + fp) if (tp + fp) else 0.0

print("True Negatives: " + str(tn))
print("False Positives: " + str(fp))
print("False Negatives: " + str(fn))
print("True Positives: " + str(tp))
print("Recall: " + str(recall))
print("Precision: " + str(precision))

In [None]:
# Accuracy of the logistic regression predictions on the LinkedIn test set
from sklearn.metrics import accuracy_score
acc = accuracy_score(y_testL, y_pred)
print ("Accuracy : ", acc)

In [None]:
# Per-class precision / recall / f1 on the LinkedIn test set
print(classification_report(y_testL,y_pred))

In [None]:
# Column 1 of predict_proba is taken as the probability of the positive class (class 1)
y_pred_prob = classifier.predict_proba(X_testL)[:, 1]

# ROC AUC of those probabilities against the true test labels
auc_score = roc_auc_score(y_true=y_testL, y_score=y_pred_prob)
print("AUC Score:", auc_score)

In [None]:
# Predicted class labels for the (oversampled) LinkedIn training features
y_train_pred = classifier.predict(X_trainL)

In [None]:
# Comparison and results check: per-class metrics on the training data
print(classification_report(y_trainL,y_train_pred))

In [None]:
# 5-fold cross-validation of the logistic regression on the LinkedIn training
# data, using the estimator's default score (reported below as accuracy).
# NOTE(review): X_trainL already contains ADASYN-generated rows, so the
# validation folds include synthetic samples and the scores are probably
# optimistic — consider resampling inside each fold (e.g. an imblearn Pipeline).
cross_val_scores = cross_val_score(estimator=classifier, X=X_trainL, y=y_trainL, cv=5)

# Per-fold scores and their mean
print("Cross-validation ACC scores:", cross_val_scores)
print("Average Accuracy:", cross_val_scores.mean())

In [None]:
# Scorer object that evaluates recall inside cross_val_score
recall_scorer = make_scorer(recall_score)

# 5-fold cross-validated recall of the logistic regression on the LinkedIn training data
cross_val_scores_recall = cross_val_score(
    estimator=classifier,
    X=X_trainL,
    y=y_trainL,
    scoring=recall_scorer,
    cv=5,
)

# Per-fold recall and its mean
print("Cross-validation Recall scores:", cross_val_scores_recall)
print("Average Cross-validation Recall score:", cross_val_scores_recall.mean())

In [None]:
# Test-set results per model on the LinkedIn data.
# Value order: precision, recall, acc, auc
# (Hard-coded numbers — presumably copied from the cell outputs above; verify after re-running.)
Precisions_linkedin = {"Logistic Regression": [0.22, 0.62, 0.72, 0.75]}

In [None]:
# Cross-validation results per model on the LinkedIn data.
# Value order: recall, acc
# (Hard-coded numbers — presumably copied from the cell outputs above; verify after re-running.)
Robustness_linkedin = {"Logistic Regression": [0.85, 0.85]}

### 4.2.4 Decision Tree

In [None]:
# Decision tree with default hyperparameters, fitted on the oversampled LinkedIn training data
# NOTE(review): no random_state is set, so the fitted tree may differ between runs.
tree = DecisionTreeClassifier()
tree.fit(X_trainL, y_trainL)

In [None]:
# Predicted class labels of the decision tree for the LinkedIn test features
y_pred_tree = tree.predict(X_testL)

In [None]:
# Confusion matrix of the decision tree on the LinkedIn test set
plot_confusion_matrix(tree,X_testL,y_testL,cmap='Blues')
plt.grid(False)

In [None]:
# Accuracy of the decision tree predictions on the LinkedIn test set
# (accuracy_score is not imported in this cell; it relies on an earlier import)
acc = accuracy_score(y_testL, y_pred_tree)
print ("Accuracy : ", acc)

In [None]:
# Per-class precision / recall / f1 of the decision tree on the LinkedIn test set
print(classification_report(y_testL, y_pred_tree))

In [None]:
# Column 1 of predict_proba is taken as the probability of the positive class (class 1)
y_pred_prob_tree = tree.predict_proba(X_testL)[:, 1]

# ROC AUC of the decision tree on the LinkedIn test set
auc_score_tree = roc_auc_score(y_true=y_testL, y_score=y_pred_prob_tree)
print("AUC Score for Decision Tree:", auc_score_tree)

In [None]:
# Decision tree predictions and confusion matrix on the LinkedIn training data
y_train_pred_tree = tree.predict(X_trainL)
plot_confusion_matrix(tree,X_trainL,y_trainL, cmap='Blues')
plt.grid(False)

In [None]:
# 5-fold cross-validation of the decision tree on the LinkedIn training data,
# using the estimator's default score (reported below as accuracy).
cross_val_scores = cross_val_score(estimator=tree, X=X_trainL, y=y_trainL, cv=5)

# Per-fold scores and their mean
print("Cross-validation ACC scores:", cross_val_scores)
print("Average Accuracy:", cross_val_scores.mean())

In [None]:
# Scorer object that evaluates recall inside cross_val_score
recall_scorer = make_scorer(recall_score)

# 5-fold cross-validated recall of the decision tree on the LinkedIn training data
cross_val_scores_recall = cross_val_score(
    estimator=tree,
    X=X_trainL,
    y=y_trainL,
    scoring=recall_scorer,
    cv=5,
)

# Per-fold recall and its mean
print("Cross-validation Recall scores:", cross_val_scores_recall)
print("Average Cross-validation Recall score:", cross_val_scores_recall.mean())

In [None]:
# Add the decision tree's test-set results (hard-coded values, same order as the other entries)
Precisions_linkedin["Decision Tree"] = [0.14, 0.25, 0.76, 0.54]

In [None]:
# Add the decision tree's cross-validation results to the existing dict.
# The dict must NOT be re-initialised here: doing so discarded the
# "Logistic Regression" entry recorded earlier in this section.
Robustness_linkedin["Decision Tree"] = [0.79, 0.85]
#recall, acc

### 4.2.5 Random Forest

In [None]:
# Fit one random forest per maximum tree depth and print its train and test reports.
# After the loop, `rf` refers to the forest fitted with the LAST depth in the
# list; the following cells evaluate that model.
# NOTE(review): no random_state is set, so results may differ between runs; the
# depth list here omits 20, unlike the financial and combined sections.
tree_depth = [5, 10, 30]
for i in tree_depth:
    rf = RandomForestClassifier(max_depth=i)
    rf.fit(X_trainL, y_trainL)
    print('Max tree depth: ', i)
    print('Train results: ', classification_report(y_trainL, rf.predict(X_trainL)))
    print('Test results: ',classification_report(y_testL, rf.predict(X_testL)))

In [None]:
# Column 1 of predict_proba is taken as the probability of the positive class (class 1)
y_pred_prob_rf = rf.predict_proba(X_testL)[:, 1]

# ROC AUC of the random forest on the LinkedIn test set
auc_score_rf = roc_auc_score(y_true=y_testL, y_score=y_pred_prob_rf)
print('AUC Score for Random Forest:', auc_score_rf)

In [None]:
# Feature importances of the current `rf`, labelled by column name, largest first
feature_scores = pd.Series(rf.feature_importances_, index=X_trainL.columns).sort_values(ascending=False)
feature_scores

In [None]:
# 5-fold cross-validation of the random forest on the LinkedIn training data,
# using the estimator's default score (reported below as accuracy).
cross_val_scores = cross_val_score(estimator=rf, X=X_trainL, y=y_trainL, cv=5)

# Per-fold scores and their mean
print("Cross-validation ACC scores:", cross_val_scores)
print("Average Accuracy:", cross_val_scores.mean())

In [None]:
# Scorer object that evaluates recall inside cross_val_score
recall_scorer = make_scorer(recall_score)

# 5-fold cross-validated recall of the random forest on the LinkedIn training data
cross_val_scores_recall = cross_val_score(
    estimator=rf,
    X=X_trainL,
    y=y_trainL,
    scoring=recall_scorer,
    cv=5,
)

# Per-fold recall and its mean
print("Cross-validation Recall scores:", cross_val_scores_recall)
print("Average Cross-validation Recall score:", cross_val_scores_recall.mean())

In [None]:
# Add the random forest's test-set results (hard-coded values, same order as the other entries)
Precisions_linkedin["Random Forest"] = [0.00, 0.00, 0.76, 0.79]

In [None]:
# Add the random forest's cross-validation results (hard-coded values)
Robustness_linkedin["Random Forest"] = [0.86, 0.88]
#recall, acc

### 4.2.6 XGBoost

In [None]:
# XGBoost classifier with default hyperparameters
xgb_classifier = xgb.XGBClassifier()

# 5-fold cross-validation on the LinkedIn training data for three metrics:
# default score (accuracy), ROC AUC and recall.
cross_val_scores_acc = cross_val_score(xgb_classifier, X_trainL, y_trainL, cv=5)
cross_val_scores_auc = cross_val_score(xgb_classifier, X_trainL, y_trainL, cv=5, scoring='roc_auc')
cross_val_scores_recall = cross_val_score(xgb_classifier, X_trainL, y_trainL, cv=5, scoring='recall')

# Report per-fold scores and their mean, metric by metric
for label, scores in (
    ("ACC", cross_val_scores_acc),
    ("AUC", cross_val_scores_auc),
    ("Recall", cross_val_scores_recall),
):
    print(f"Cross-validation {label} scores:", scores)
    print(f"Average Cross-validation {label} score:", scores.mean())

In [None]:
# Fit the XGBoost classifier to the (oversampled) LinkedIn training data
xgb_classifier.fit(X_trainL, y_trainL)

# Plot the feature importance of the fitted model
importance = plot_importance(xgb_classifier, height=0.9)
plt.show()

In [None]:
# Print classification reports of the fitted XGBoost model for train and test sets
print('Train results: ', classification_report(y_trainL, xgb_classifier.predict(X_trainL)))
print('Test results: ', classification_report(y_testL, xgb_classifier.predict(X_testL)))

# Calculate the predicted probabilities for the positive class (class 1);
# they are used for the AUC in the next cell
y_pred_prob_xgb = xgb_classifier.predict_proba(X_testL)[:, 1]

In [None]:

# Calculate the AUC score for the XGBoost classifier on the LinkedIn test set
auc_score_xgb = roc_auc_score(y_testL, y_pred_prob_xgb)
print('AUC Score for XGBoost:', auc_score_xgb)

In [None]:
# Add XGBoost's test-set results (hard-coded values, same order as the other entries)
Precisions_linkedin["XGBoost"] = [0.10, 0.12, 0.79, 0.75]

In [None]:
# Add XGBoost's cross-validation results (hard-coded values)
Robustness_linkedin["XGBoost"]= [0.87, 0.96]
#recall, acc

## 4.3 Running models on df_com

### 4.3.1 Splitting between train and test data

In [None]:
# Feature matrix: every column of the combined frame except the target.
predictors = df_com.drop(columns=['Default'])

# Target vector: the 'Default' column that the models predict.
target = df_com['Default']

In [None]:
# 80/20 split of the combined data with the same seed as the other two splits:
#   X_trainC_pre / y_trainC_pre - features / target of the training part (80%),
#                                 before any oversampling
#   X_testC / y_testC           - features / target of the held-out part (20%)
X_trainC_pre, X_testC, y_trainC_pre, y_testC = train_test_split(
    predictors,
    target,
    test_size=0.2,
    random_state=356,
)

### 4.3.2 Oversampling of train data

In [None]:
# Create an instance of the ADASYN class (fixed seed)
adasyn = ADASYN(random_state=42)

# Apply ADASYN to the training split only, to generate synthetic data
X_trainC_adasyn, y_trainC_adasyn = adasyn.fit_resample(X_trainC_pre, y_trainC_pre)

# Wrap the resampled data in a DataFrame / Series again, keeping the original
# column names and target name
X_trainC = pd.DataFrame(X_trainC_adasyn, columns=X_trainC_pre.columns)
y_trainC = pd.Series(y_trainC_adasyn, name=y_trainC_pre.name)

In [None]:
# Class counts of the combined training target before and after oversampling.
# The labels name ADASYN because that is the sampler applied to the training
# split (the previous 'SMOTE' labels were misleading).
temp1 = pd.DataFrame(y_trainC_pre)
temp2 = pd.DataFrame(y_trainC)

print('Before ADASYN')
print(temp1['Default'].value_counts())
print('After ADASYN')
print(temp2['Default'].value_counts())

In [None]:
# Class counts of the target in the combined test split (which was not resampled).
temp3 = pd.DataFrame(y_testC)
test_class_counts = temp3['Default'].value_counts()

print('Check for test data')
print(test_class_counts)

In [None]:
# Persist the combined train/test splits as CSV files (machine-specific absolute paths).
# NOTE(review): the training files written here are the PRE-oversampling
# X_trainC_pre / y_trainC_pre, whereas the financial section saves the oversampled
# X_train / y_train — confirm which variant is intended.
X_testC.to_csv(r'C:\Users\chiar\OneDrive\Masterthesis\Modell\Dataframes\Default\Com\x_test_com.csv', index=False)
X_trainC_pre.to_csv(r'C:\Users\chiar\OneDrive\Masterthesis\Modell\Dataframes\Default\Com\x_train_com.csv', index=False)
y_testC.to_csv(r'C:\Users\chiar\OneDrive\Masterthesis\Modell\Dataframes\Default\Com\y_test_com.csv', index=False)
y_trainC_pre.to_csv(r'C:\Users\chiar\OneDrive\Masterthesis\Modell\Dataframes\Default\Com\y_train_com.csv', index=False)

### 4.3.3  Logistic Regression

In [None]:
# Install the warning filter BEFORE fitting: placed after fit(), it could not
# suppress anything raised while the model is trained.
# NOTE(review): this silences every warning category for the rest of the
# session, including solver convergence warnings — confirm that is wanted.
warnings.filterwarnings("ignore")

# Logistic regression with a fixed seed, fitted on the oversampled combined training data
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_trainC, y_trainC)

In [None]:
# Predicted class labels for the held-out combined test features
y_pred = classifier.predict(X_testC)
# Compare the true test labels with the predicted labels
cm = confusion_matrix(y_testC, y_pred)
 
print ("Confusion Matrix : \n", cm)

In [None]:
# Visualization: confusion matrix of the logistic regression on the combined test set
plot_confusion_matrix(classifier,X_testC,y_testC,cmap='Blues')
plt.grid(False)

In [None]:
# Unpack the four cells of the binary confusion matrix `cm`.
tn, fp, fn, tp = cm.ravel()

# Guard the denominators: with no actual positives (recall) or no predicted
# positives (precision) the plain division yields NaN plus a RuntimeWarning;
# report 0.0 in that case instead.
recall = tp / (fn + tp) if (fn + tp) else 0.0
precision = tp / (tp + fp) if (tp + fp) else 0.0

print("True Negatives: " + str(tn))
print("False Positives: " + str(fp))
print("False Negatives: " + str(fn))
print("True Positives: " + str(tp))
print("Recall: " + str(recall))
print("Precision: " + str(precision))

In [None]:
# Accuracy of the logistic regression predictions on the combined test set
from sklearn.metrics import accuracy_score
acc = accuracy_score(y_testC, y_pred)
print ("Accuracy : ", acc)

In [None]:
# Per-class precision / recall / f1 on the combined test set
print(classification_report(y_testC,y_pred))

In [None]:
# Column 1 of predict_proba is taken as the probability of the positive class (class 1)
y_pred_prob = classifier.predict_proba(X_testC)[:, 1]

# ROC AUC of those probabilities against the true test labels
auc_score = roc_auc_score(y_true=y_testC, y_score=y_pred_prob)
print("AUC Score:", auc_score)

In [None]:
# Predicted class labels for the (oversampled) combined training features
y_train_pred = classifier.predict(X_trainC)
# Comparison and results check: per-class metrics on the training data
print(classification_report(y_trainC,y_train_pred))

In [None]:
# 5-fold cross-validation of the logistic regression on the combined training
# data, using the estimator's default score (reported below as accuracy).
# NOTE(review): X_trainC already contains ADASYN-generated rows, so the
# validation folds include synthetic samples and the scores are probably
# optimistic — consider resampling inside each fold (e.g. an imblearn Pipeline).
cross_val_scores = cross_val_score(estimator=classifier, X=X_trainC, y=y_trainC, cv=5)

# Per-fold scores and their mean
print("Cross-validation ACC scores:", cross_val_scores)
print("Average Accuracy:", cross_val_scores.mean())

In [None]:
# Scorer object that evaluates recall inside cross_val_score
recall_scorer = make_scorer(recall_score)

# 5-fold cross-validated recall of the logistic regression on the combined training data
cross_val_scores_recall = cross_val_score(
    estimator=classifier,
    X=X_trainC,
    y=y_trainC,
    scoring=recall_scorer,
    cv=5,
)

# Per-fold recall and its mean
print("Cross-validation Recall scores:", cross_val_scores_recall)
print("Average Cross-validation Recall score:", cross_val_scores_recall.mean())

In [None]:
# Test-set results per model on the combined data.
# Four values per model; presumably precision, recall, acc, auc like the
# financial and LinkedIn result tables — TODO confirm.
# (Hard-coded numbers — presumably copied from the cell outputs above; verify after re-running.)
Precisions_com = {"Logistic Regression": [0.60, 0.38, 0.91, 0.95]}

In [None]:
# Cross-validation results per model on the combined data.
# Value order: recall, acc
# (Hard-coded numbers — presumably copied from the cell outputs above; verify after re-running.)
Robustness_com = {"Logistic Regression": [0.99, 0.97]}

### 4.3.4 Decision Tree

In [None]:
# Decision tree with default hyperparameters, fitted on the oversampled combined training data
# NOTE(review): no random_state is set, so the fitted tree may differ between runs.
tree = DecisionTreeClassifier()
tree.fit(X_trainC, y_trainC)

In [None]:
# Decision tree predictions and confusion matrix on the combined test set
y_pred_tree = tree.predict(X_testC)
plot_confusion_matrix(tree,X_testC,y_testC,cmap='Blues')
plt.grid(False)

In [None]:
# Accuracy of the decision tree predictions on the combined test set
acc = accuracy_score(y_testC, y_pred_tree)
print ("Accuracy : ", acc)

In [None]:
# Per-class precision / recall / f1 of the decision tree on the combined test set
print(classification_report(y_testC, y_pred_tree))

In [None]:
# Column 1 of predict_proba is taken as the probability of the positive class (class 1)
y_pred_prob_tree = tree.predict_proba(X_testC)[:, 1]

# ROC AUC of the decision tree on the combined test set
auc_score_tree = roc_auc_score(y_true=y_testC, y_score=y_pred_prob_tree)
print("AUC Score for Decision Tree:", auc_score_tree)


In [None]:
# Decision tree predictions and confusion matrix on the combined training data
y_train_pred_tree = tree.predict(X_trainC)
plot_confusion_matrix(tree,X_trainC,y_trainC, cmap='Blues')
plt.grid(False)

In [None]:
# Accuracy of the decision tree on the combined training data
print ("Accuracy : ", accuracy_score(y_trainC, y_train_pred_tree))

In [None]:
# Per-class precision / recall / f1 of the decision tree on the combined training data
print(classification_report(y_trainC, y_train_pred_tree))

In [None]:
# 5-fold cross-validation of the decision tree on the combined training data,
# using the estimator's default score (reported below as accuracy).
cross_val_scores = cross_val_score(estimator=tree, X=X_trainC, y=y_trainC, cv=5)

# Per-fold scores and their mean
print("Cross-validation ACC scores:", cross_val_scores)
print("Average Accuracy:", cross_val_scores.mean())

In [None]:
# Scorer object that evaluates recall inside cross_val_score
recall_scorer = make_scorer(recall_score)

# 5-fold cross-validated recall of the decision tree on the combined training data
cross_val_scores_recall = cross_val_score(
    estimator=tree,
    X=X_trainC,
    y=y_trainC,
    scoring=recall_scorer,
    cv=5,
)

# Per-fold recall and its mean
print("Cross-validation Recall scores:", cross_val_scores_recall)
print("Average Cross-validation Recall score:", cross_val_scores_recall.mean())

In [None]:
# Add the decision tree's test-set results (hard-coded values, same order as the other entries)
Precisions_com["Decision Tree"] = [0.27, 0.50, 0.80, 0.67]

In [None]:
# Add the decision tree's cross-validation results (hard-coded values)
Robustness_com["Decision Tree"] = [0.88, 0.91]
#recall, acc

### 4.3.5 Random Forest

In [None]:
# Fit one random forest per maximum tree depth and print its train and test reports.
# After the loop, `rf` refers to the forest fitted with the LAST depth in the
# list; the following cells evaluate that model.
# NOTE(review): no random_state is set, so results may differ between runs.
tree_depth = [5, 10, 20, 30]
for i in tree_depth:
    rf = RandomForestClassifier(max_depth=i)
    rf.fit(X_trainC, y_trainC)
    print('Max tree depth: ', i)
    print('Train results: ', classification_report(y_trainC, rf.predict(X_trainC)))
    print('Test results: ',classification_report(y_testC, rf.predict(X_testC)))

In [None]:
# Column 1 of predict_proba is taken as the probability of the positive class (class 1)
y_pred_prob_rf = rf.predict_proba(X_testC)[:, 1]

# ROC AUC of the random forest on the combined test set
auc_score_rf = roc_auc_score(y_true=y_testC, y_score=y_pred_prob_rf)
print('AUC Score for Random Forest:', auc_score_rf)

In [None]:
# Feature importances of the current `rf`, labelled by column name, largest first
feature_scores = pd.Series(rf.feature_importances_, index=X_trainC.columns).sort_values(ascending=False)
feature_scores

In [None]:
# 5-fold cross-validation of the random forest on the combined training data,
# using the estimator's default score (reported below as accuracy).
cross_val_scores = cross_val_score(estimator=rf, X=X_trainC, y=y_trainC, cv=5)

# Per-fold scores and their mean
print("Cross-validation ACC scores:", cross_val_scores)
print("Average Accuracy:", cross_val_scores.mean())

In [None]:
# Scorer object that evaluates recall inside cross_val_score
recall_scorer = make_scorer(recall_score)

# 5-fold cross-validated recall of the random forest on the combined training data
cross_val_scores_recall = cross_val_score(
    estimator=rf,
    X=X_trainC,
    y=y_trainC,
    scoring=recall_scorer,
    cv=5,
)

# Per-fold recall and its mean
print("Cross-validation Recall scores:", cross_val_scores_recall)
print("Average Cross-validation Recall score:", cross_val_scores_recall.mean())

In [None]:
# Add the random forest's test-set results (hard-coded values, same order as the other entries)
Precisions_com["Random Forest"] = [0.55, 0.75, 0.91, 0.96]

In [None]:
# Add the random forest's cross-validation results (hard-coded values)
Robustness_com["Random Forest"] = [0.91, 0.96]
#recall, acc

### 4.3.6 XGBoost

In [None]:
# XGBoost classifier with default hyperparameters
xgb_classifier = xgb.XGBClassifier()

# Arguments shared by every cross-validation run below: same estimator,
# same training data, 5 folds.
cv_kwargs = dict(estimator=xgb_classifier, X=X_trainC, y=y_trainC, cv=5)

# Accuracy: no `scoring` is passed, so the estimator's default score is used
cross_val_scores_acc = cross_val_score(**cv_kwargs)
print("Cross-validation ACC scores:", cross_val_scores_acc)
print("Average Cross-validation ACC score:", cross_val_scores_acc.mean())

# Area under the ROC curve
cross_val_scores_auc = cross_val_score(scoring='roc_auc', **cv_kwargs)
print("Cross-validation AUC scores:", cross_val_scores_auc)
print("Average Cross-validation AUC score:", cross_val_scores_auc.mean())

# Recall
cross_val_scores_recall = cross_val_score(scoring='recall', **cv_kwargs)
print("Cross-validation Recall scores:", cross_val_scores_recall)
print("Average Cross-validation Recall score:", cross_val_scores_recall.mean())

# Precision, via a scorer object built from precision_score
precision_scorer = make_scorer(precision_score)
cross_val_scores_precision = cross_val_score(scoring=precision_scorer, **cv_kwargs)
print("Cross-validation Precision scores:", cross_val_scores_precision)
print("Average Cross-validation Precision score:", cross_val_scores_precision.mean())

In [None]:
# Train the XGBoost classifier on the full training split
xgb_classifier.fit(X_trainC, y_trainC)

# Bar chart of the 10 most important features of the fitted model
importance = plot_importance(xgb_classifier, height=0.9, max_num_features=10)
plt.show()

# Classification report on the data the model was trained on
xgb_train_report = classification_report(y_trainC, xgb_classifier.predict(X_trainC))
print('Train results: ', xgb_train_report)

# Classification report on the held-out test data
xgb_test_report = classification_report(y_testC, xgb_classifier.predict(X_testC))
print('Test results: ', xgb_test_report)

In [None]:
# Class probabilities of the XGBoost model on the test split; column 1 is
# taken as the probability of the positive class (class 1).
xgb_test_probabilities = xgb_classifier.predict_proba(X_testC)
y_pred_prob_xgb = xgb_test_probabilities[:, 1]

# Area under the ROC curve for the XGBoost classifier on the test split
auc_score_xgb = roc_auc_score(y_true=y_testC, y_score=y_pred_prob_xgb)
print('AUC Score for XGBoost:', auc_score_xgb)

In [None]:
# XGBoost on the combined data set. The list is read by the summary table in
# section 5.3 as [precision (1), recall (1), accuracy, AUC].
# NOTE(review): the figures look hand-transcribed from the reports above —
# re-check them after re-running the notebook.
Precisions_com["XGBoost"] = [0.55, 0.75, 0.91, 0.95]

In [None]:
# Robustness figures for XGBoost on the combined data set, stored as
# [recall, accuracy] (the column order used when the dict is tabulated).
# NOTE(review): presumably the averaged 5-fold cross-validation scores from
# the cells above — confirm after re-running.
Robustness_com["XGBoost"] = [0.96, 0.95]

# 5. Evaluating - comparing Models

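The results of all models are compiled into one summary table per data set (financial, LinkedIn, combined). In each table the highest value of every column is highlighted in red, and the model with the best AUC is named below the table.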

## 5.1 Results financial data

In [None]:
# Summary table for the models evaluated on the financial data.
# Every value of Precisions_financials is a list ordered like `headers[1:]`:
# [precision (1), recall (1), accuracy, AUC].
headers = ["", "Precision (1)", "Recall (1)", "Accuracy", "AUC"]
table = PrettyTable()
table.field_names = headers

# ANSI escape sequences used to print a value in red
RED, RESET = "\033[31m", "\033[0m"

# Number of metric columns (everything except the model-name column)
n_metrics = len(headers) - 1

# Highest value seen in each column; slot 0 (model name) is unused and only
# keeps the indices aligned with `headers`.
max_values = [0.0] * len(headers)

best_model_auc = ""
max_auc_value = 0.0

# First pass: determine the column maxima and the model with the highest AUC
for model, metrics in Precisions_financials.items():
    for column, value in enumerate(metrics[:n_metrics], start=1):
        max_values[column] = max(max_values[column], value)

    auc_value = metrics[3]
    # Strict comparison: on ties the first model encountered is kept
    if auc_value > max_auc_value:
        max_auc_value = auc_value
        best_model_auc = model

# Second pass: add the rows with every column maximum wrapped in red. The
# cells are formatted before they enter the table, so PrettyTable's private
# `_rows` attribute never has to be modified.
for model, metrics in Precisions_financials.items():
    cells = [
        f"{RED}{value}{RESET}" if value == column_max else value
        for value, column_max in zip(metrics[:n_metrics], max_values[1:])
    ]
    table.add_row([model, *cells])

# Print the table with the highest values in each column marked in red
print(table)

# Print the "Best model" message in red
print(f"\033[31mBest model based on AUC: {best_model_auc}\033[0m")


In [None]:
# Tabulate the robustness figures of the financial-data models:
# one row per model, columns in the stored order [recall, accuracy].
df_robustness3 = pd.DataFrame.from_dict(
    Robustness_financials,
    orient='index',
    columns=['Recall', 'Accuracy'],
)
print(df_robustness3)

## 5.2 Results LinkedIn data

In [None]:
# Summary table for the models evaluated on the LinkedIn data.
# Every value of Precisions_linkedin is a list ordered like `headers[1:]`:
# [precision (1), recall (1), accuracy, AUC].
headers = ["", "Precision (1)", "Recall (1)", "Accuracy", "AUC"]
table = PrettyTable()
table.field_names = headers

# ANSI escape sequences used to print a value in red
RED, RESET = "\033[31m", "\033[0m"

# Number of metric columns (everything except the model-name column)
n_metrics = len(headers) - 1

# Highest value seen in each column; slot 0 (model name) is unused and only
# keeps the indices aligned with `headers`.
max_values = [0.0] * len(headers)

best_model_auc = ""
max_auc_value = 0.0

# First pass: determine the column maxima and the model with the highest AUC
for model, metrics in Precisions_linkedin.items():
    for column, value in enumerate(metrics[:n_metrics], start=1):
        max_values[column] = max(max_values[column], value)

    auc_value = metrics[3]
    # Strict comparison: on ties the first model encountered is kept
    if auc_value > max_auc_value:
        max_auc_value = auc_value
        best_model_auc = model

# Second pass: add the rows with every column maximum wrapped in red. The
# cells are formatted before they enter the table, so PrettyTable's private
# `_rows` attribute never has to be modified.
for model, metrics in Precisions_linkedin.items():
    cells = [
        f"{RED}{value}{RESET}" if value == column_max else value
        for value, column_max in zip(metrics[:n_metrics], max_values[1:])
    ]
    table.add_row([model, *cells])

# Print the table with the highest values in each column marked in red
print(table)

# Print the "Best model" message in red
print(f"\033[31mBest model based on AUC: {best_model_auc}\033[0m")


In [None]:
# Tabulate the robustness figures of the LinkedIn-data models:
# one row per model, columns in the stored order [recall, accuracy].
df_robustness2 = pd.DataFrame.from_dict(
    Robustness_linkedin,
    orient='index',
    columns=['Recall', 'Accuracy'],
)
print(df_robustness2)

## 5.3 Results combined data

In [None]:
# Summary table for the models evaluated on the combined data.
# Every value of Precisions_com is a list ordered like `headers[1:]`:
# [precision (1), recall (1), accuracy, AUC].
headers = ["", "Precision (1)", "Recall (1)", "Accuracy", "AUC"]
table = PrettyTable()
table.field_names = headers

# ANSI escape sequences used to print a value in red
RED, RESET = "\033[31m", "\033[0m"

# Number of metric columns (everything except the model-name column)
n_metrics = len(headers) - 1

# Highest value seen in each column; slot 0 (model name) is unused and only
# keeps the indices aligned with `headers`.
max_values = [0.0] * len(headers)

best_model_auc = ""
max_auc_value = 0.0

# First pass: determine the column maxima and the model with the highest AUC
for model, metrics in Precisions_com.items():
    for column, value in enumerate(metrics[:n_metrics], start=1):
        max_values[column] = max(max_values[column], value)

    auc_value = metrics[3]
    # Strict comparison: on ties the first model encountered is kept
    if auc_value > max_auc_value:
        max_auc_value = auc_value
        best_model_auc = model

# Second pass: add the rows with every column maximum wrapped in red. The
# cells are formatted before they enter the table, so PrettyTable's private
# `_rows` attribute never has to be modified.
for model, metrics in Precisions_com.items():
    cells = [
        f"{RED}{value}{RESET}" if value == column_max else value
        for value, column_max in zip(metrics[:n_metrics], max_values[1:])
    ]
    table.add_row([model, *cells])

# Print the table with the highest values in each column marked in red
print(table)

# Print the "Best model" message in red
print(f"\033[31mBest model based on AUC: {best_model_auc}\033[0m")

In [None]:
# Tabulate the robustness figures of the combined-data models:
# one row per model, columns in the stored order [recall, accuracy].
df_robustness = pd.DataFrame.from_dict(
    Robustness_com,
    orient='index',
    columns=['Recall', 'Accuracy'],
)
print(df_robustness)