# 1. Business Understanding

Business valuations are crucial for a diverse range of stakeholders, guiding capital allocation decisions based on precise assessments of companies' economic performance, regardless of whether the trends are positive, steady, or negative. The risks of overly negative valuations, of misinterpreting positive trends, and of overlooking negative developments are equally significant. Such misjudgments can impede a company's refinancing options, lead to missed investment prospects for investors, and result in financial losses. In the following code, we evaluate whether aggregated features from LinkedIn help to improve the quality of the prediction of a downgrade. Three data frames are used: financial metrics only, LinkedIn metrics only, and both combined. The evaluation is considered successful if a positive influence of the LinkedIn features on the prediction can be determined. AUC and recall are considered particularly relevant metrics. Details can be found in the Data chapter of the corresponding master thesis.

# 2. Load data and prepare libraries

Comments have been added with the help of ChatGPT to improve readability.

## 2.1 Import libraries

In [None]:
import os
import re
import numpy as np
import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor
from prettytable import PrettyTable
from imblearn.over_sampling import ADASYN
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import LabelBinarizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score
import xgboost as xgb
from xgboost import plot_importance, plot_tree, XGBClassifier, XGBRegressor
import seaborn as sns
from datetime import datetime
import matplotlib.pyplot as plt
from fuzzywuzzy import fuzz
import warnings
warnings.filterwarnings("ignore")
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.metrics import roc_auc_score

## 2.2 Load datasets

In [None]:
# Load the company list from the Excel export into df_up and preview the first rows.
# NOTE(review): absolute, user-specific Windows path - adjust before running on another machine.
dateipfad = r'C:\Users\chiar\OneDrive\Masterthesis\Modell\List of companys_onetemplate_downgrade.xls'
df_up = pd.read_excel(dateipfad)
df_up.head()

In [None]:
# Load the semicolon-separated CSV into df_waf_rfm and preview the first rows.
# NOTE(review): absolute, user-specific Windows path - adjust before running on another machine.
dateipfad = r'C:\Users\chiar\OneDrive\Masterthesis\Modell\df_waf_final.csv'
df_waf_rfm = pd.read_csv(dateipfad, sep=';')
df_waf_rfm.head()

In [None]:
df_waf_rfm.shape

Author knowledge: initialisation values were used when generating df_waf_rfm. They are dropped from the dataframe before the merge.

In [None]:
# Select the rows whose "Number of employees 2014" equals 1
is_placeholder = df_waf_rfm['Number of employees 2014'].eq(1)
filtered_df = df_waf_rfm.loc[is_placeholder]
filtered_df

In [None]:
filtered_df.shape

In [None]:
# Keep only the rows whose "Number of employees 2014" differs from 1
df_waf_rfm = df_waf_rfm.loc[df_waf_rfm['Number of employees 2014'].ne(1)]

In [None]:
# Load the semicolon-separated matching table into df_match and preview the first rows.
# NOTE(review): absolute, user-specific Windows path - adjust before running on another machine.
dateipfad = r'C:\Users\chiar\OneDrive\Masterthesis\Modell\Matching.csv'
df_match = pd.read_csv(dateipfad, sep=';')
df_match.head()

## 2.3 Merge Datasets

Cleaning Company name to make a match with firm_original_name possible.

In [None]:
# Preserve the untouched names in "Copy_Company_Name" before cleaning
df_up['Copy_Company_Name'] = df_up['Company Name']


def _strip_parenthesised(value):
    """Remove everything from the first "(" to the last ")" and trim whitespace."""
    without_parentheses = re.sub(r'\(.*\)', '', str(value))
    return without_parentheses.strip()


# Clean the "Company Name" column element by element
df_up['Company Name'] = df_up['Company Name'].map(_strip_parenthesised)

# Show df_up after the modifications
df_up

In [None]:
# Characters that are neither word characters (letters, digits, underscore) nor whitespace
_NON_WORD_CHARS = re.compile(r'[^\w\s]')


def clean_company_name(name):
    """Return ``str(name)`` with punctuation and other special characters removed.

    Word characters (letters, digits, underscore) and whitespace are kept;
    whitespace is not collapsed or trimmed.
    """
    return _NON_WORD_CHARS.sub('', str(name))

# Run every entry of "Company Name" through clean_company_name
df_up['Company Name'] = df_up['Company Name'].map(clean_company_name)
df_up

In [None]:
# Function for fuzzy matching to find the best match for each company name
def find_best_match(company_name, reference_names, threshold_similarity=95, scorer=None):
    """Return the reference name that is most similar to ``company_name``.

    Parameters
    ----------
    company_name : str
        Name to look up.
    reference_names : iterable of str
        Candidate names to compare against.
    threshold_similarity : int or float, default 95
        Minimum score the best candidate must reach to be accepted.
    scorer : callable, optional
        ``scorer(company_name, ref_name)`` returning a similarity score.
        Defaults to ``fuzz.token_set_ratio``.

    Returns
    -------
    The best-scoring reference name, or ``None`` if no candidate reaches
    ``threshold_similarity`` (this includes an empty ``reference_names``).
    """
    if scorer is None:
        # Looked up at call time so a custom scorer can be used without fuzzywuzzy
        scorer = fuzz.token_set_ratio

    best_match = None
    best_similarity = 0

    for ref_name in reference_names:
        similarity = scorer(company_name, ref_name)
        # Strict ">" keeps the first of several equally similar candidates
        if similarity > best_similarity:
            best_match = ref_name
            best_similarity = similarity

    # Return the best match if its score reaches the threshold, else None
    return best_match if best_similarity >= threshold_similarity else None

# Candidate names on both sides of the match
firm_original_names_waf = df_waf_rfm['Firm_original_name'].tolist()
company_names_up = df_up['Company Name'].tolist()

# Pair every company name from df_up with its best match in df_waf_rfm (None if there is none)
matched_companies = [
    (name, find_best_match(name, firm_original_names_waf))
    for name in company_names_up
]

# Turn the pairs into a two-column DataFrame
results_matching = pd.DataFrame(
    matched_companies,
    columns=['Company Name Up', 'Best Match in df_waf_rfm'],
)

# Display the results
results_matching


In [None]:
# Number of companies for which no match was found
unmatched_mask = results_matching['Best Match in df_waf_rfm'].isnull()
num_none_matches = unmatched_mask.sum()

# Report the count
print(f"Number of entries with 'None' in Best Match: {num_none_matches}")

In [None]:
# Step 1: Attach the fuzzy-match result to every company in df_up.
# Companies that share a name produce several rows in results_matching; the
# left merge would then multiply rows and break the row-by-row alignment with
# df_up that the loop below relies on. Keeping one row per name restores a
# strict one-to-one merge.
unique_matches = results_matching.drop_duplicates(subset='Company Name Up')
df_up_merged = pd.merge(df_up, unique_matches, left_on='Company Name', right_on='Company Name Up', how='left')

# Name -> best match lookup, built once instead of filtering results_matching for every row
match_by_name = dict(zip(unique_matches['Company Name Up'], unique_matches['Best Match in df_waf_rfm']))

# Step 2 and 3: Iterate over the companies in df_up and look up their match.
# NOTE(review): index_up is used as a row label of df_up_merged, which assumes
# df_up carries a default 0..n-1 index like the merge result - confirm.
for index_up, row_up in df_up.iterrows():
    company_name_up = row_up['Company Name']

    # Step 4: Check whether a match exists for this company
    best_match_waf = match_by_name[company_name_up]
    if pd.isna(best_match_waf):
        # Step 5: No match - leave the df_waf_rfm columns empty for this row
        df_up_merged.loc[index_up, df_waf_rfm.columns] = None
    else:
        # Step 6: Match found - copy the first matching df_waf_rfm row into df_up_merged
        row_waf = df_waf_rfm.loc[df_waf_rfm['Firm_original_name'] == best_match_waf]
        df_up_merged.loc[index_up, df_waf_rfm.columns] = row_waf.values[0]

df_up_merged

In [None]:
# Number of rows without a value in "Number of employees 2014"
missing_employees = df_up_merged['Number of employees 2014'].isnull()
num_nan_employees = missing_employees.sum()

# Report the count
print(f"Number of entries with NaN in Number of employees 2014: {num_nan_employees}")

NaN values in the "Number of employees" fields show that no LinkedIn data are available for the company. Therefore, these rows are dropped.

In [None]:
# Remove the rows without "Number of employees 2014" and renumber the remaining rows from 0
df_up_merged = (
    df_up_merged
    .dropna(subset=['Number of employees 2014'])
    .reset_index(drop=True)
)
df_up_merged.head(2)

In [None]:
df_up_merged.shape

Matching was successful in df_up_merged.

# 3. Data Preparation

During data preparation, the data are first checked in general (3.1). Then missing values (3.1.1), duplicates (3.1.2), non-numeric columns (3.1.4) and the distribution of the target variables (3.1.5) are checked. Due to the data type, data outliers can only be checked downstream. Therefore, an initial data cleaning is carried out in 3.2. Columns that are not needed are removed (3.2.1), the data type is corrected (3.2.2), and the columns country (3.2.3) and industry (3.2.4) are cleaned. On this basis, the data outliers can be examined in 3.3.1. Subsequently, the content-related data distribution is checked (3.3.2, 3.3.3). The final data cleaning is carried out in chapter 3.4. Values that should not be considered are removed (3.4.1, 3.4.2), coding is done where necessary (3.4.3), and empty values are handled (3.4.4). The data are put into a time series format in 3.5. Finally, collinearity and multicollinearity are checked (3.6).

## 3.1 Data Inspection

In [None]:
df_up_merged.head(10)

The following columns do not add value in this context and are therefore not needed:
- Adress
- S&P Entity ID
- Excel Company ID
- Index Constituents [Secondary Listings]
- S&P Entity Credit Rating - Issuer Credit Rating - Local Currency LT [Latest] (Rating)
- S&P Entity Credit Rating Date - Issuer Credit Rating - Local Currency LT [Latest] (Rating Date)
- S&P Entity Credit Rating - Issuer Credit Rating - Foreign Currency LT [Latest] (Rating)
- S&P Entity Credit Rating Date - Issuer Credit Rating - Foreign Currency LT [Latest] (Rating Date)
- S&P Entity Credit Rating - Issuer Credit Rating - Local Currency LT [Latest] (CreditWatch)
- S&P Entity Credit Rating Date - Issuer Credit Rating - Foreign Currency LT [Latest] (Rating Date)	
- S&P Entity Credit Rating - Issuer Credit Rating - Local Currency LT [Latest] (CreditWatch)
- S&P Entity Credit Rating Date - Issuer Credit Rating - Local Currency LT [Latest] (CreditWatch Date)
- S&P Entity Credit Rating - Issuer Credit Rating - Local Currency LT [Latest] (Outlook)
- S&P Entity Credit Rating Date - Issuer Credit Rating - Foreign Currency LT [Latest] (Outlook Date)
- the author decided to focus on the timeseries 2014-2018. Therefore the values for 2013 and >2018 can be deleted.

In [None]:
df_up_merged.describe()

In [None]:
df_up_merged.info()

In [None]:
# Report the number of rows, columns and total cells of the dataset
n_rows, n_columns = df_up_merged.shape
print(f'The dataset has {n_rows} rows and {n_columns} columns. This results in {df_up_merged.size} data entries.')

In [None]:
# Analyze the data types of columns in df_up_merged
column_data_types = df_up_merged.dtypes

# Set the option to display all rows and columns
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Display the result
print("Data Types of Columns in df_up_merged:")
print(column_data_types.to_string())

Apart from the first 11 columns, the other entries are numbers. These must be converted into float values.

### 3.1.1 Checking for missing values

In [None]:
# Lift the display limits so that every column and row is printed
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

# Missing values per column
null_counts = df_up_merged.isna().sum()

# Show the result
print("Number of null values in each column of df_up_merged:")
print(null_counts)

The following columns contain highest Number of None/NaN fields that need to be cleaned:
- Gross Profit/ Employee 2018
- All Rating and changes in Rating
- Consider dropping companys that have missing values in financials.

Some columns are included that are no longer needed and contain some empty values. Author knowledge is being used. These are:
- Rating 2018 ALT 
- Rating 2012
- Change 2012/2013 (does not concern analysis period)
- New joining work experience 2014 (empty)
- Number of Notices 2018
- Number of notices 2018 (empty due to missing values in 2019)
- Number of New Joiners 2014 (authors knowledge)

### 3.1.2 Checking for duplicates

In [None]:
# Rows that repeat an earlier row in every column
is_duplicate = df_up_merged.duplicated()
duplicates = df_up_merged.loc[is_duplicate]
print(f"Duplicate Rows :  {len(duplicates)}")
duplicates

As expected, there are no duplicates in this dataframe. No cleaning necessary.

### 3.1.3 Checking for data outliers

The check for data outliers is done later in this notebook. Most columns first need to be converted to a processable number format.

### 3.1.4 Inspecting non-numerical columns 

In [None]:
df_up_merged['Geographic Region'].head()

In [None]:
unique_region_values = df_up_merged['Geographic Region'].unique()
unique_region_values

In [None]:
df_up_merged['Land'].head()

In [None]:
unique_country_values = df_up_merged['Land'].unique()
unique_country_values

A correlation between Geographic Region and country is expected. Geographic Region contains less information and should be dropped if needed. The values in the country column ("Land") need to be cleaned, since the same entries appear in both upper-case and lower-case spelling.

In [None]:
df_up_merged['Exchange'].head()

In [None]:
unique_exchange_values = df_up_merged['Exchange'].unique()
unique_exchange_values

INFO: 
- OM: Nasdaq OMX Nordic, a stock exchange in Sweden, Denmark, Finland, and Iceland;
- SWX: SIX Swiss Exchange, the Swiss stock exchange;
- NYSE: New York Stock Exchange, the stock exchange in New York City, USA;
- ENXTPA: Euronext Paris, the French stock exchange;
- NasdaqGS: Nasdaq Global Select Market, a US-based stock exchange, part of the Nasdaq Stock Market;
- XTRA: Frankfurt Stock Exchange, the stock exchange in Frankfurt, Germany;
- ENXTAM: Euronext Amsterdam, the Dutch stock exchange;
- BME: Bolsas y Mercados Españoles, the stock exchange in Spain;
- LSE: London Stock Exchange, the stock exchange in London, United Kingdom;
- ENXTBR: Euronext Brussels, the stock exchange in Belgium;
- BIT: Borsa Italiana, the stock exchange in Italy;
- ISE: Irish Stock Exchange, the stock exchange in Ireland;
- CPSE: Nasdaq Copenhagen (Copenhagen Stock Exchange), the stock exchange in Denmark;
- WBAG: Wiener Börse AG, the stock exchange in Austria;
- OB: Oslo Børs, the stock exchange in Norway;
- HLSE: Helsinki Stock Exchange, the stock exchange in Finland.

In [None]:
#Counting the number of Tickers. They act as unique identifier per company and should be kept. 
unique_Ticker_count = df_up_merged['Ticker'].nunique()
unique_Ticker_count

No cleaning of column ticker needed from a subject specific point of view.

In [None]:
unique_industry_values = df_up_merged['S&P RatingsDirect® Industry'].unique()
unique_industry_values

Cleaning tasks: "Corporates; Industrials" is a pre-configuration and can be dropped. The main industry that follows in the breakdown is the interesting one and needs to be kept. All other information is considered detail and should be dropped. Also rename the column to "Industry".

In [None]:
# Rating columns ("Rating 2014" ... "Rating 2019") whose distinct values are listed
columns_to_check = [f'Rating {year}' for year in range(2014, 2020)]

for column in columns_to_check:
    # Distinct values found in this column
    unique_values = df_up_merged[column].unique()

    # Header line followed by the values
    print(f"Unique values for {column}:")
    print(unique_values)

Variables need to be converted to categorical features to use them in further analysis.

###  3.1.5 Checking for target variable downgrade

In [None]:
# Rating-change columns whose distinct values are listed
# NOTE(review): "Change 2014/15" is not part of this list - confirm whether that is intended.
columns_to_check = [
    "Change 2013/14",
    "Change 2015/16",
    "Change 2016/17",
    "Change 2017/18",
    "Change 2018/19",
]

for column in columns_to_check:
    # Distinct values found in this column
    unique_values = df_up_merged[column].unique()

    # Header line followed by the values
    print(f"Unique values for {column}:")
    print(unique_values)

In [None]:
# Count the "down grade" entries per period. Comparing and summing yields 0 when
# the label never occurs, whereas value_counts()["down grade"] would raise a KeyError.
downgrade_2017_18_count = df_up_merged["Change 2017/18"].eq("down grade").sum()
downgrade_2018_19_count = df_up_merged["Change 2018/19"].eq("down grade").sum()

print("Number 'down grade' Change 2017/18:", downgrade_2017_18_count)
print("Number 'down grade' Change 2018/19:", downgrade_2018_19_count)


The data basis for downgrades is poor. Synthetic data will be needed for the training data.

One-hot encoding is necessary for 2018 and 2017. Numerical coding is not chosen because it is the target variable and a binary expression is more useful here.

## 3.2 First data cleansing to enable deeper Data inspection

In [None]:
#Copy for better work contorl
df_up = df_up_merged.copy()

### 3.2.1 Dropping columns that are not needed or empty

In [None]:
# List of columns to remove.
# Every column is listed exactly once (the original list named two S&P columns twice).

# Columns that are empty, superseded or pure metadata
_unused_columns = [
    "Rating 2018 ALT",
    "Rating 2012",
    "Change 2012/13",
    "New joining work experience 2014",
    "Migrating work experience 2018",
    "Number of Notices 2018",
    "Number of notices 2018",
    "Number of New Joiners 2014",
    'Adress',
    'S&P Entity ID',
    'Excel Company ID',
    'Index Constituents [Secondary Listings]',
    'Index Constituents [Primary Listing]',
    'S&P Entity Credit Rating - Issuer Credit Rating - Local Currency LT [Latest] (Rating)',
    'S&P Entity Credit Rating Date - Issuer Credit Rating - Local Currency LT [Latest] (Rating Date)',
    'S&P Entity Credit Rating - Issuer Credit Rating - Foreign Currency LT [Latest] (Rating)',
    'S&P Entity Credit Rating Date - Issuer Credit Rating - Foreign Currency LT [Latest] (Rating Date)',
    'S&P Entity Credit Rating - Issuer Credit Rating - Local Currency LT [Latest] (CreditWatch)',
    'S&P Entity Credit Rating Date - Issuer Credit Rating - Local Currency LT [Latest] (CreditWatch Date)',
    'S&P Entity Credit Rating - Issuer Credit Rating - Local Currency LT [Latest] (Outlook)',
    'S&P Entity Credit Rating Date - Issuer Credit Rating - Foreign Currency LT [Latest] (Outlook Date)',
]

# Financial figures for the years outside the 2014-2018 analysis window
_EUR_SUFFIX = " (€EURmm, Historical rate)"
_YEARS_OUTSIDE_WINDOW = (2013, 2019, 2020)

_market_cap_columns = [
    "Market Capitalization [12/31/2013]" + _EUR_SUFFIX,
    "Market Capitalization [My Setting] [12/31/2019]" + _EUR_SUFFIX,
    "Market Capitalization [My Setting] [12/31/2020]" + _EUR_SUFFIX,
]

# (metric name, whether the column name carries the EUR suffix)
_capital_iq_metrics = [
    ("EBITDA", True),
    ("EBIT", True),
    ("Net Income", True),
    ("Total Equity", True),
    ("Total Debt", True),
    ("Total Assets", True),
    ("Net Debt", True),
    ("Gross Profit", True),
    ("Total Employees", False),
    ("Cash from Ops.", True),
    ("Total Revenue", True),
]
_capital_iq_columns = [
    f"{metric} - Capital IQ [CY {year}]" + (_EUR_SUFFIX if in_eur else "")
    for metric, in_eur in _capital_iq_metrics
    for year in _YEARS_OUTSIDE_WINDOW
]

# Helper columns from the name matching and ratings/changes after 2019
_helper_and_late_columns = [
    "Copy_Company_Name", "Company Name Up", "Best Match in df_waf_rfm", "Firm_original_name",
    "Rating 2020",
    "Rating 2021",
    "Rating 2022",
    "Rating 2023",
    "Change 2019/20",
    "Change 2020/21",
    "Change 2021/22",
    "Change 2022/23",
]

columns_to_remove = (
    _unused_columns
    + _market_cap_columns
    + _capital_iq_columns
    + _helper_and_late_columns
)

# Drop the specified columns from the DataFrame
df_up.drop(columns=columns_to_remove, inplace=True)
df_up.head(2)

### 3.2.2 Converting columns from object to float. 

First there is a need to check for special characters (spaces, etc.)

In [None]:
def check_for_special_characters(df, columns_to_check):
    """Print every cell in the given columns that contains a special character.

    A special character is anything other than a word character (letters,
    digits, underscore), whitespace or a full stop. Each cell is converted
    with ``str`` before it is checked. The function only prints and returns
    ``None``.
    """
    special_character = re.compile(r'[^\w\s.]')

    # Collect (row label, column name, cell value) for every offending cell
    findings = [
        (row_label, column_name, cell)
        for column_name in columns_to_check
        for row_label, cell in df[column_name].items()
        if special_character.search(str(cell))
    ]

    if not findings:
        print("Keine Sonderzeichen in den angegebenen Spalten gefunden.")
        return

    print("Folgende Sonderzeichen wurden gefunden:")
    for row_label, column_name, cell in findings:
        print(f"Row {row_label}, Column {column_name}, Value: {cell}")


# Names of all columns that are expected to hold numbers for the years 2014-2018.
# The names are assembled from their recurring parts instead of being spelled out one by one.
_eur = " (€EURmm, Historical rate)"
_window = range(2014, 2019)

columns_to_check = (
    # Market capitalization: the naming changes from 2017 onwards
    [f"Market Capitalization [12/31/{year}]{_eur}" for year in (2014, 2015, 2016)]
    + [f"Market Capitalization [My Setting] [12/31/{year}]{_eur}" for year in (2017, 2018)]
    # Capital IQ financials reported in EUR
    + [
        f"{metric} - Capital IQ [CY {year}]{_eur}"
        for metric in (
            "EBITDA", "EBIT", "Net Income", "Total Equity",
            "Total Debt", "Total Assets", "Net Debt", "Gross Profit",
        )
        for year in _window
    ]
    # Head count carries no currency suffix
    + [f"Total Employees - Capital IQ [CY {year}]" for year in _window]
    + [
        f"{metric} - Capital IQ [CY {year}]{_eur}"
        for metric in ("Cash from Ops.", "Total Revenue")
        for year in _window
    ]
    # LinkedIn features; not every feature exists for every year
    + [f"Employee development {year}" for year in range(2015, 2019)]
    + [f"Migrating work experience {year}" for year in range(2014, 2018)]
    + [f"New joining work experience {year}" for year in range(2015, 2019)]
    + [f"Fluctuation rate {year}" for year in _window]
    + ["More than once/different position"]
)

check_for_special_characters(df_up, columns_to_check)


There are negative numbers, commas and also empty fields indicated by "-". These characters need to be cleaned.
Next, it must be checked whether the values are whole numbers or decimal numbers.

In [None]:
# Check for integers in columns
def check_for_integers(df, columns_to_check):
    """Return the columns in which every value is written as an integer.

    Each value is converted with ``str`` and must consist of an optional
    sign followed by digits only. Negative integers such as ``-5`` therefore
    count as integers (``str.isdigit`` alone rejected them), while decimal
    numbers, empty strings and missing values do not.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to inspect.
    columns_to_check : iterable of str
        Names of the columns to test.

    Returns
    -------
    list of str
        The names of the integer-only columns, in the order given.
    """
    integer_literal = re.compile(r'[+-]?\d+')

    integer_columns = []
    for column in columns_to_check:
        is_integer = df[column].apply(
            lambda x: integer_literal.fullmatch(str(x)) is not None
        ).all()
        if is_integer:
            integer_columns.append(column)
    return integer_columns

# Run the integer check on the numeric candidate columns collected in columns_to_check.
# An empty list here would make the check vacuous: it would always report that
# no integer-only columns exist, regardless of the data.
columns_to_convert = list(columns_to_check)
integer_columns = check_for_integers(df_up, columns_to_convert)

if integer_columns:
    print("The following columns contain integers:")
    print(integer_columns)
else:
    print("No columns with only integers were found.")

Columns can be converted to float, since the dataset only contains decimal numbers. Finally, the decimal separator is checked.

In [None]:
def _numeric_candidate_columns():
    """Assemble the names of all columns that should end up as float values."""
    eur = " (€EURmm, Historical rate)"
    window = range(2014, 2019)
    eur_metrics_before_head_count = (
        "EBITDA", "EBIT", "Net Income", "Total Equity",
        "Total Debt", "Total Assets", "Net Debt", "Gross Profit",
    )
    eur_metrics_after_head_count = ("Cash from Ops.", "Total Revenue")

    names = []
    # Market capitalization: the naming changes from 2017 onwards
    for year in window:
        prefix = "Market Capitalization [My Setting]" if year >= 2017 else "Market Capitalization"
        names.append(f"{prefix} [12/31/{year}]{eur}")
    # Capital IQ financials in EUR, then the head count without a currency suffix
    for metric in eur_metrics_before_head_count:
        names.extend(f"{metric} - Capital IQ [CY {year}]{eur}" for year in window)
    names.extend(f"Total Employees - Capital IQ [CY {year}]" for year in window)
    for metric in eur_metrics_after_head_count:
        names.extend(f"{metric} - Capital IQ [CY {year}]{eur}" for year in window)
    # LinkedIn features; not every feature exists for every year
    names.extend(f"Employee development {year}" for year in range(2015, 2019))
    names.extend(f"Migrating work experience {year}" for year in range(2014, 2018))
    names.extend(f"New joining work experience {year}" for year in range(2015, 2019))
    names.extend(f"Fluctuation rate {year}" for year in window)
    names.append("More than once/different position")
    return names


columns_to_convert = _numeric_candidate_columns()

def check_comma_or_dot(df, columns):
    """Classify string columns by the decimal separator they contain.

    Args:
        df: DataFrame holding the columns to inspect.
        columns: Iterable of column labels. Each column must support the
            pandas ``.str`` accessor.

    Returns:
        A ``(comma_columns, dot_columns)`` tuple of lists. A column with at
        least one comma is reported only in ``comma_columns``; otherwise it
        is reported in ``dot_columns`` if it holds at least one literal dot.
        Columns with neither character appear in neither list.
    """
    comma_columns = []
    dot_columns = []

    for column in columns:
        text = df[column].str
        # regex=False: "." has to be matched literally. Interpreted as a
        # regular expression it matches any character, which would flag
        # every non-empty column as "contains a dot".
        # na=False: a missing entry counts as "separator not present".
        if text.contains(',', regex=False, na=False).any():
            comma_columns.append(column)
        elif text.contains('.', regex=False, na=False).any():
            dot_columns.append(column)

    return comma_columns, dot_columns

# Find out which of the listed columns use a comma and which use a dot
comma_columns, dot_columns = check_comma_or_dot(df_up, columns_to_convert)

# Print each heading followed by the matching column list
for heading, found_columns in (
    ("Columns with comma:", comma_columns),
    ("Columns with dot:", dot_columns),
):
    print(heading)
    print(found_columns)

For the conversion to succeed, the same decimal separator has to be used everywhere. Therefore, commas are replaced by dots.

In [None]:
# Columns in which a decimal comma is replaced by a decimal dot
columns_to_convert = [
    "Employee development 2015",
    "Employee development 2016",
    "Employee development 2017",
    "Employee development 2018",
    "Migrating work experience 2014",
    "Migrating work experience 2015",
    "Migrating work experience 2016",
    "Migrating work experience 2017",
    "New joining work experience 2015",
    "New joining work experience 2016",
    "New joining work experience 2017",
    "New joining work experience 2018",
    "Fluctuation rate 2014",
    "Fluctuation rate 2015",
    "Fluctuation rate 2016",
    "Fluctuation rate 2017",
    "Fluctuation rate 2018",
    "More than once/different position",
]

# Replace commas with dots in the relevant columns. regex=False states
# explicitly that "," is a literal: the default of `regex` is not the same
# in every pandas version, so the call should not depend on it.
for column in columns_to_convert:
    df_up[column] = df_up[column].str.replace(',', '.', regex=False)

# Preview the first rows
df_up.head(5)

The "-" entries, which stand for an empty value, are converted to NaN.

In [None]:
# Some columns use "-" as a placeholder for an empty value. These
# placeholders have to be replaced before the columns can be converted to
# float. The list below names the columns that are cleaned in this cell.
columns_to_convert = [
    "Market Capitalization [12/31/2014] (€EURmm, Historical rate)",
    "Market Capitalization [12/31/2015] (€EURmm, Historical rate)",
    "Market Capitalization [12/31/2016] (€EURmm, Historical rate)",  
    "Market Capitalization [My Setting] [12/31/2017] (€EURmm, Historical rate)",
    "Market Capitalization [My Setting] [12/31/2018] (€EURmm, Historical rate)",
    "EBITDA - Capital IQ [CY 2014] (€EURmm, Historical rate)",
    "EBITDA - Capital IQ [CY 2015] (€EURmm, Historical rate)",
    "EBITDA - Capital IQ [CY 2016] (€EURmm, Historical rate)",
    "EBITDA - Capital IQ [CY 2017] (€EURmm, Historical rate)",
    "EBITDA - Capital IQ [CY 2018] (€EURmm, Historical rate)",
    "EBIT - Capital IQ [CY 2014] (€EURmm, Historical rate)",
    "EBIT - Capital IQ [CY 2015] (€EURmm, Historical rate)",
    "EBIT - Capital IQ [CY 2016] (€EURmm, Historical rate)",
    "EBIT - Capital IQ [CY 2017] (€EURmm, Historical rate)",
    "EBIT - Capital IQ [CY 2018] (€EURmm, Historical rate)",
    "Net Income - Capital IQ [CY 2014] (€EURmm, Historical rate)",
    "Net Income - Capital IQ [CY 2015] (€EURmm, Historical rate)",
    "Net Income - Capital IQ [CY 2016] (€EURmm, Historical rate)",
    "Net Income - Capital IQ [CY 2017] (€EURmm, Historical rate)",
    "Net Income - Capital IQ [CY 2018] (€EURmm, Historical rate)",
    "Total Equity - Capital IQ [CY 2014] (€EURmm, Historical rate)",
    "Total Equity - Capital IQ [CY 2015] (€EURmm, Historical rate)",
    "Total Equity - Capital IQ [CY 2016] (€EURmm, Historical rate)",
    "Total Equity - Capital IQ [CY 2017] (€EURmm, Historical rate)",
    "Total Equity - Capital IQ [CY 2018] (€EURmm, Historical rate)",
    "Total Debt - Capital IQ [CY 2014] (€EURmm, Historical rate)",
    "Total Debt - Capital IQ [CY 2015] (€EURmm, Historical rate)",
    "Total Debt - Capital IQ [CY 2016] (€EURmm, Historical rate)",
    "Total Debt - Capital IQ [CY 2017] (€EURmm, Historical rate)",
    "Total Debt - Capital IQ [CY 2018] (€EURmm, Historical rate)",
    "Total Assets - Capital IQ [CY 2014] (€EURmm, Historical rate)",
    "Total Assets - Capital IQ [CY 2015] (€EURmm, Historical rate)",
    "Total Assets - Capital IQ [CY 2016] (€EURmm, Historical rate)",
    "Total Assets - Capital IQ [CY 2017] (€EURmm, Historical rate)",
    "Total Assets - Capital IQ [CY 2018] (€EURmm, Historical rate)",
    "Net Debt - Capital IQ [CY 2014] (€EURmm, Historical rate)",
    "Net Debt - Capital IQ [CY 2015] (€EURmm, Historical rate)",
    "Net Debt - Capital IQ [CY 2016] (€EURmm, Historical rate)",
    "Net Debt - Capital IQ [CY 2017] (€EURmm, Historical rate)",
    "Net Debt - Capital IQ [CY 2018] (€EURmm, Historical rate)",
    "Gross Profit - Capital IQ [CY 2014] (€EURmm, Historical rate)",
    "Gross Profit - Capital IQ [CY 2015] (€EURmm, Historical rate)",
    "Gross Profit - Capital IQ [CY 2016] (€EURmm, Historical rate)",
    "Gross Profit - Capital IQ [CY 2017] (€EURmm, Historical rate)",
    "Gross Profit - Capital IQ [CY 2018] (€EURmm, Historical rate)",
    "Total Employees - Capital IQ [CY 2014]",
    "Total Employees - Capital IQ [CY 2015]",
    "Total Employees - Capital IQ [CY 2016]",
    "Total Employees - Capital IQ [CY 2017]",
    "Total Employees - Capital IQ [CY 2018]",
    "Cash from Ops. - Capital IQ [CY 2014] (€EURmm, Historical rate)",
    "Cash from Ops. - Capital IQ [CY 2015] (€EURmm, Historical rate)",
    "Cash from Ops. - Capital IQ [CY 2016] (€EURmm, Historical rate)",
    "Cash from Ops. - Capital IQ [CY 2017] (€EURmm, Historical rate)",
    "Cash from Ops. - Capital IQ [CY 2018] (€EURmm, Historical rate)",
    "Total Revenue - Capital IQ [CY 2014] (€EURmm, Historical rate)",
    "Total Revenue - Capital IQ [CY 2015] (€EURmm, Historical rate)",
    "Total Revenue - Capital IQ [CY 2016] (€EURmm, Historical rate)",
    "Total Revenue - Capital IQ [CY 2017] (€EURmm, Historical rate)",
    "Total Revenue - Capital IQ [CY 2018] (€EURmm, Historical rate)",
]

# A cell whose whole value is "-" marks missing data: swap it for NaN
# (Not-a-Number) in every listed column.
for column_name in columns_to_convert:
    without_placeholder = df_up[column_name].replace('-', float('nan'))
    df_up[column_name] = without_placeholder

Lastly the columns can be converted to float.

In [None]:
# Columns that are cast to float further down in this cell. Values that
# cannot be parsed as a number are left unchanged by that conversion (they
# are not turned into NaN).
columns_to_convert = [
    "Market Capitalization [12/31/2014] (€EURmm, Historical rate)",
    "Market Capitalization [12/31/2015] (€EURmm, Historical rate)",
    "Market Capitalization [12/31/2016] (€EURmm, Historical rate)",  
    "Market Capitalization [My Setting] [12/31/2017] (€EURmm, Historical rate)",
    "Market Capitalization [My Setting] [12/31/2018] (€EURmm, Historical rate)",
    "EBITDA - Capital IQ [CY 2014] (€EURmm, Historical rate)",
    "EBITDA - Capital IQ [CY 2015] (€EURmm, Historical rate)",
    "EBITDA - Capital IQ [CY 2016] (€EURmm, Historical rate)",
    "EBITDA - Capital IQ [CY 2017] (€EURmm, Historical rate)",
    "EBITDA - Capital IQ [CY 2018] (€EURmm, Historical rate)",
    "EBIT - Capital IQ [CY 2014] (€EURmm, Historical rate)",
    "EBIT - Capital IQ [CY 2015] (€EURmm, Historical rate)",
    "EBIT - Capital IQ [CY 2016] (€EURmm, Historical rate)",
    "EBIT - Capital IQ [CY 2017] (€EURmm, Historical rate)",
    "EBIT - Capital IQ [CY 2018] (€EURmm, Historical rate)",
    "Net Income - Capital IQ [CY 2014] (€EURmm, Historical rate)",
    "Net Income - Capital IQ [CY 2015] (€EURmm, Historical rate)",
    "Net Income - Capital IQ [CY 2016] (€EURmm, Historical rate)",
    "Net Income - Capital IQ [CY 2017] (€EURmm, Historical rate)",
    "Net Income - Capital IQ [CY 2018] (€EURmm, Historical rate)",
    "Total Equity - Capital IQ [CY 2014] (€EURmm, Historical rate)",
    "Total Equity - Capital IQ [CY 2015] (€EURmm, Historical rate)",
    "Total Equity - Capital IQ [CY 2016] (€EURmm, Historical rate)",
    "Total Equity - Capital IQ [CY 2017] (€EURmm, Historical rate)",
    "Total Equity - Capital IQ [CY 2018] (€EURmm, Historical rate)",
    "Total Debt - Capital IQ [CY 2014] (€EURmm, Historical rate)",
    "Total Debt - Capital IQ [CY 2015] (€EURmm, Historical rate)",
    "Total Debt - Capital IQ [CY 2016] (€EURmm, Historical rate)",
    "Total Debt - Capital IQ [CY 2017] (€EURmm, Historical rate)",
    "Total Debt - Capital IQ [CY 2018] (€EURmm, Historical rate)",
    "Total Assets - Capital IQ [CY 2014] (€EURmm, Historical rate)",
    "Total Assets - Capital IQ [CY 2015] (€EURmm, Historical rate)",
    "Total Assets - Capital IQ [CY 2016] (€EURmm, Historical rate)",
    "Total Assets - Capital IQ [CY 2017] (€EURmm, Historical rate)",
    "Total Assets - Capital IQ [CY 2018] (€EURmm, Historical rate)",
    "Net Debt - Capital IQ [CY 2014] (€EURmm, Historical rate)",
    "Net Debt - Capital IQ [CY 2015] (€EURmm, Historical rate)",
    "Net Debt - Capital IQ [CY 2016] (€EURmm, Historical rate)",
    "Net Debt - Capital IQ [CY 2017] (€EURmm, Historical rate)",
    "Net Debt - Capital IQ [CY 2018] (€EURmm, Historical rate)",
    "Gross Profit - Capital IQ [CY 2014] (€EURmm, Historical rate)",
    "Gross Profit - Capital IQ [CY 2015] (€EURmm, Historical rate)",
    "Gross Profit - Capital IQ [CY 2016] (€EURmm, Historical rate)",
    "Gross Profit - Capital IQ [CY 2017] (€EURmm, Historical rate)",
    "Gross Profit - Capital IQ [CY 2018] (€EURmm, Historical rate)",
    "Total Employees - Capital IQ [CY 2014]",
    "Total Employees - Capital IQ [CY 2015]",
    "Total Employees - Capital IQ [CY 2016]",
    "Total Employees - Capital IQ [CY 2017]",
    "Total Employees - Capital IQ [CY 2018]",
    "Cash from Ops. - Capital IQ [CY 2014] (€EURmm, Historical rate)",
    "Cash from Ops. - Capital IQ [CY 2015] (€EURmm, Historical rate)",
    "Cash from Ops. - Capital IQ [CY 2016] (€EURmm, Historical rate)",
    "Cash from Ops. - Capital IQ [CY 2017] (€EURmm, Historical rate)",
    "Cash from Ops. - Capital IQ [CY 2018] (€EURmm, Historical rate)",
    "Total Revenue - Capital IQ [CY 2014] (€EURmm, Historical rate)",
    "Total Revenue - Capital IQ [CY 2015] (€EURmm, Historical rate)",
    "Total Revenue - Capital IQ [CY 2016] (€EURmm, Historical rate)",
    "Total Revenue - Capital IQ [CY 2017] (€EURmm, Historical rate)",
    "Total Revenue - Capital IQ [CY 2018] (€EURmm, Historical rate)",
    "Employee development 2015",
    "Employee development 2016",
    "Employee development 2017",
    "Employee development 2018",
    "Migrating work experience 2014",
    "Migrating work experience 2015",
    "Migrating work experience 2016",
    "Migrating work experience 2017",
    "New joining work experience 2015",
    "New joining work experience 2016",
    "New joining work experience 2017",
    "New joining work experience 2018",
    "Fluctuation rate 2014",
    "Fluctuation rate 2015",
    "Fluctuation rate 2016",
    "Fluctuation rate 2017",
    "Fluctuation rate 2018",
    "More than once/different position",
]

def convert_to_float_with_negatives(value):
    """Convert a single cell value to ``float`` when possible.

    Args:
        value: Any scalar, typically a numeric string such as ``"-12.5"``.

    Returns:
        ``float(value)`` if the conversion succeeds (negative numbers and
        NaN convert normally); otherwise ``value`` itself, unchanged.
    """
    try:
        return float(value)
    except (ValueError, TypeError):
        # ValueError: text that is not a number (e.g. "n/a").
        # TypeError: objects float() does not accept at all (e.g. None).
        # In both cases the cell is kept as-is, so one odd value does not
        # abort the conversion of a whole column.
        return value

# Run the element-wise float conversion over every listed column
for name in columns_to_convert:
    raw_values = df_up[name]
    df_up[name] = raw_values.apply(convert_to_float_with_negatives)

# Preview the first rows
df_up.head(5)


Check if converting was successful:

In [None]:
# Collect the dtype of every column in df_up
column_data_types = df_up.dtypes

# Lift pandas' display limits so that no row or column is truncated
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

# Print one line per column with its dtype
print("Data Types of Columns in df_up:")
print(column_data_types.to_string())

### 3.2.3 Cleaning column country

In [None]:
# Normalise the country values to upper case
land_upper = df_up["Land"].str.upper()
df_up["Land"] = land_upper

# List the distinct values that remain
unique_land_values = land_upper.unique()
unique_land_values

In [None]:
# Upper-case the country values, then force the column to string data type.
# NOTE(review): astype(str) also stringifies missing entries - confirm that
# this is intended.
df_up["Land"] = df_up["Land"].str.upper().astype(str)

# List the distinct values that remain
unique_land_values = df_up["Land"].unique()
unique_land_values


### 3.2.4 Unify values in Industries

In [None]:
# Give the S&P industry column a short name that is easier to work with
industry_rename = {"S&P RatingsDirect® Industry": "Industry"}
df_up.rename(columns=industry_rename, inplace=True)
df_up.head(1)

In [None]:
# Step 1: strip the literal prefix "Corporates; Industrials;" from each entry
industry_without_prefix = df_up['Industry'].str.replace('Corporates; Industrials;', '', regex=False)

# Step 2: keep only the text in front of the first remaining semicolon.
# NOTE(review): surrounding whitespace is not stripped here.
df_up['Industry'] = industry_without_prefix.str.split(';').str[0]

# Display the distinct values left in the "Industry" column
unique_industries = df_up['Industry'].unique()
unique_industries

### 3.2.5 One Hot Encoding for target variable

In [None]:
# Missing entries in "Change 2018/19" are treated as "No Change"
# (the original note says there are only seven of them)
change_2018_19 = df_up['Change 2018/19']
df_up['Change 2018/19'] = change_2018_19.fillna(value='No Change')

In [None]:
# Number of entries in "Change 2018/19" that are still missing
nan_count_change_2018_19 = df_up['Change 2018/19'].isnull().sum()
nan_count_change_2018_19

In [None]:
# Missing entries in "Change 2017/18" are treated as "No Change"
change_2017_18 = df_up['Change 2017/18']
df_up['Change 2017/18'] = change_2017_18.fillna(value='No Change')

In [None]:
# Number of entries in "Change 2017/18" that are still missing
nan_count_change_2017_18 = df_up['Change 2017/18'].isnull().sum()
nan_count_change_2017_18

Checking whether the values are unified.

In [None]:
# Distinct labels present in the "Change 2018/19" column
unique_values_change_2018_19 = pd.unique(df_up['Change 2018/19'])
unique_values_change_2018_19

In [None]:
# Distinct labels present in the "Change 2017/18" column
unique_values_change_2017_18 = pd.unique(df_up['Change 2017/18'])
unique_values_change_2017_18

In [None]:
# Unify the spelling: "No Change" becomes "no change" in both change columns
for change_column in ('Change 2018/19', 'Change 2017/18'):
    df_up[change_column] = df_up[change_column].replace('No Change', 'no change')

In [None]:
# Re-check the distinct labels in "Change 2018/19" after the clean-up
unique_values_change_2018_19 = pd.unique(df_up['Change 2018/19'])
unique_values_change_2018_19

In [None]:
# One-hot encode "Change 2018/19", then "Change 2017/18"; each dummy column
# is named "<prefix>_<category>"
df_up = pd.get_dummies(df_up, columns=['Change 2018/19'], prefix='Change 2018 19')
df_up = pd.get_dummies(df_up, columns=['Change 2017/18'], prefix='Change 2017 18')

# Only the "down grade" indicator of each year is kept; the other
# categories are listed here and dropped in one call
columns_to_remove_2018 = ['Change 2018 19_first rating', 'Change 2018 19_no change', 'Change 2018 19_up grade']
columns_to_remove_2017 = ['Change 2017 18_first rating', 'Change 2017 18_no change', 'Change 2017 18_up grade']
df_up.drop(columns=columns_to_remove_2018 + columns_to_remove_2017, inplace=True)

# Give the two remaining indicator columns their final names
df_up.rename(
    columns={
        'Change 2018 19_down grade': 'Downgrade 2018',
        'Change 2017 18_down grade': 'Downgrade 2017',
    },
    inplace=True,
)

# df_up_wt is a second name for the same DataFrame object (no copy is made)
df_up_wt = df_up

# Display the resulting DataFrame
df_up_wt.head(2)

## 3.3 Deeper data inspection

In [None]:
# Work on an independent (deep) copy from here on / df_pp = pre-processed
df_pp = df_up_wt.copy(deep=True)

### 3.3.1 Checking for data outliers

Note: Data outliers are checked in groups to confirm that there are no obvious errors in the data. Due to the nature of the domain, it is not absolutely necessary to clean the outliers - especially since the source of the financials is Bloomberg, which counts as a reliable source.

In [None]:
# Market capitalisation columns, one per year end from 2014 to 2018
selected_columns = [
    "Market Capitalization [12/31/2014] (€EURmm, Historical rate)",
    "Market Capitalization [12/31/2015] (€EURmm, Historical rate)",
    "Market Capitalization [12/31/2016] (€EURmm, Historical rate)",
    "Market Capitalization [My Setting] [12/31/2017] (€EURmm, Historical rate)",
    "Market Capitalization [My Setting] [12/31/2018] (€EURmm, Historical rate)",
]

# Open a figure; adjust the size if needed
plt.figure(figsize=(12, 8))

# Draw one box per column and label the axes it was drawn on
market_cap_axes = df_pp[selected_columns].boxplot()
market_cap_axes.set_title("Market Capitalization Boxplots")
market_cap_axes.set_ylabel("Market Capitalization (€EURmm)")
plt.xticks(rotation=45)  # tilted x labels are easier to read
plt.show()

In [None]:
# EBITDA columns, one per calendar year from 2014 to 2018
selected_columns = [
    "EBITDA - Capital IQ [CY 2014] (€EURmm, Historical rate)",
    "EBITDA - Capital IQ [CY 2015] (€EURmm, Historical rate)",
    "EBITDA - Capital IQ [CY 2016] (€EURmm, Historical rate)",
    "EBITDA - Capital IQ [CY 2017] (€EURmm, Historical rate)",
    "EBITDA - Capital IQ [CY 2018] (€EURmm, Historical rate)",
]

plt.figure(figsize=(12, 8))  # Adjust the figure size if needed

df_pp[selected_columns].boxplot()
plt.title("EBITDA Boxplots")
# The y axis shows EBITDA, not market capitalisation
plt.ylabel("EBITDA (€EURmm)")
plt.xticks(rotation=45)  # Rotate x-axis labels for better readability
plt.show()

In [None]:
# EBIT columns, one per calendar year from 2014 to 2018
selected_columns = [
    "EBIT - Capital IQ [CY 2014] (€EURmm, Historical rate)",
    "EBIT - Capital IQ [CY 2015] (€EURmm, Historical rate)",
    "EBIT - Capital IQ [CY 2016] (€EURmm, Historical rate)",
    "EBIT - Capital IQ [CY 2017] (€EURmm, Historical rate)",
    "EBIT - Capital IQ [CY 2018] (€EURmm, Historical rate)",
]

plt.figure(figsize=(12, 8))  # Adjust the figure size if needed

df_pp[selected_columns].boxplot()
plt.title("EBIT Boxplots")
# The y axis shows EBIT, not market capitalisation
plt.ylabel("EBIT (€EURmm)")
plt.xticks(rotation=45)  # Rotate x-axis labels for better readability
plt.show()

In [None]:
# Net income columns, one per calendar year from 2014 to 2018
selected_columns = [
    "Net Income - Capital IQ [CY 2014] (€EURmm, Historical rate)",
    "Net Income - Capital IQ [CY 2015] (€EURmm, Historical rate)",
    "Net Income - Capital IQ [CY 2016] (€EURmm, Historical rate)",
    "Net Income - Capital IQ [CY 2017] (€EURmm, Historical rate)",
    "Net Income - Capital IQ [CY 2018] (€EURmm, Historical rate)",
]

plt.figure(figsize=(12, 8))  # Adjust the figure size if needed

df_pp[selected_columns].boxplot()
plt.title("Net Income Boxplots")
# The y axis shows net income, not market capitalisation
plt.ylabel("Net Income (€EURmm)")
plt.xticks(rotation=45)  # Rotate x-axis labels for better readability
plt.show()

In [None]:
# Total equity columns, one per calendar year from 2014 to 2018
selected_columns = [
    "Total Equity - Capital IQ [CY 2014] (€EURmm, Historical rate)",
    "Total Equity - Capital IQ [CY 2015] (€EURmm, Historical rate)",
    "Total Equity - Capital IQ [CY 2016] (€EURmm, Historical rate)",
    "Total Equity - Capital IQ [CY 2017] (€EURmm, Historical rate)",
    "Total Equity - Capital IQ [CY 2018] (€EURmm, Historical rate)",
]

plt.figure(figsize=(12, 8))  # Adjust the figure size if needed

df_pp[selected_columns].boxplot()
plt.title("Total Equity Boxplots")
# The y axis shows total equity, not market capitalisation
plt.ylabel("Total Equity (€EURmm)")
plt.xticks(rotation=45)  # Rotate x-axis labels for better readability
plt.show()

In [None]:
# Total debt columns, one per calendar year from 2014 to 2018
selected_columns = [
    "Total Debt - Capital IQ [CY 2014] (€EURmm, Historical rate)",
    "Total Debt - Capital IQ [CY 2015] (€EURmm, Historical rate)",
    "Total Debt - Capital IQ [CY 2016] (€EURmm, Historical rate)",
    "Total Debt - Capital IQ [CY 2017] (€EURmm, Historical rate)",
    "Total Debt - Capital IQ [CY 2018] (€EURmm, Historical rate)",
]

plt.figure(figsize=(12, 8))  # Adjust the figure size if needed

df_pp[selected_columns].boxplot()
plt.title("Total Debt Boxplots")
# The y axis shows total debt, not market capitalisation
plt.ylabel("Total Debt (€EURmm)")
plt.xticks(rotation=45)  # Rotate x-axis labels for better readability
plt.show()

In [None]:
# Total assets columns, one per calendar year from 2014 to 2018
selected_columns = [
    "Total Assets - Capital IQ [CY 2014] (€EURmm, Historical rate)",
    "Total Assets - Capital IQ [CY 2015] (€EURmm, Historical rate)",
    "Total Assets - Capital IQ [CY 2016] (€EURmm, Historical rate)",
    "Total Assets - Capital IQ [CY 2017] (€EURmm, Historical rate)",
    "Total Assets - Capital IQ [CY 2018] (€EURmm, Historical rate)",
]

plt.figure(figsize=(12, 8))  # Adjust the figure size if needed

df_pp[selected_columns].boxplot()
plt.title("Total Assets Boxplots")
# The y axis shows total assets, not market capitalisation
plt.ylabel("Total Assets (€EURmm)")
plt.xticks(rotation=45)  # Rotate x-axis labels for better readability
plt.show()

In [None]:
# Net debt columns, one per calendar year from 2014 to 2018
selected_columns = [
    "Net Debt - Capital IQ [CY 2014] (€EURmm, Historical rate)",
    "Net Debt - Capital IQ [CY 2015] (€EURmm, Historical rate)",
    "Net Debt - Capital IQ [CY 2016] (€EURmm, Historical rate)",
    "Net Debt - Capital IQ [CY 2017] (€EURmm, Historical rate)",
    "Net Debt - Capital IQ [CY 2018] (€EURmm, Historical rate)",
]

plt.figure(figsize=(12, 8))  # Adjust the figure size if needed

df_pp[selected_columns].boxplot()
# Title and y label name the metric that is actually plotted (net debt);
# they previously read "Total Debt" and "Market Capitalization"
plt.title("Net Debt Boxplots")
plt.ylabel("Net Debt (€EURmm)")
plt.xticks(rotation=45)  # Rotate x-axis labels for better readability
plt.show()

In [None]:
# Gross profit columns, one per calendar year from 2014 to 2018
selected_columns = [
    "Gross Profit - Capital IQ [CY 2014] (€EURmm, Historical rate)",
    "Gross Profit - Capital IQ [CY 2015] (€EURmm, Historical rate)",
    "Gross Profit - Capital IQ [CY 2016] (€EURmm, Historical rate)",
    "Gross Profit - Capital IQ [CY 2017] (€EURmm, Historical rate)",
    "Gross Profit - Capital IQ [CY 2018] (€EURmm, Historical rate)",
]

plt.figure(figsize=(12, 8))  # Adjust the figure size if needed

df_pp[selected_columns].boxplot()
plt.title("Gross Profit Boxplots")
# The y axis shows gross profit, not market capitalisation
plt.ylabel("Gross Profit (€EURmm)")
plt.xticks(rotation=45)  # Rotate x-axis labels for better readability
plt.show()

In [None]:
# Total employee columns, one per calendar year from 2014 to 2018
selected_columns = [
    "Total Employees - Capital IQ [CY 2014]",
    "Total Employees - Capital IQ [CY 2015]",
    "Total Employees - Capital IQ [CY 2016]",
    "Total Employees - Capital IQ [CY 2017]",
    "Total Employees - Capital IQ [CY 2018]",
]

plt.figure(figsize=(12, 8))  # Adjust the figure size if needed

df_pp[selected_columns].boxplot()
plt.title("Total Employees Boxplots")
# The y axis shows the employee figures, not a currency amount
plt.ylabel("Total Employees")
plt.xticks(rotation=45)  # Rotate x-axis labels for better readability
plt.show()

In [None]:
# Cash-from-operations columns, one per calendar year from 2014 to 2018
selected_columns = [
    "Cash from Ops. - Capital IQ [CY 2014] (€EURmm, Historical rate)",
    "Cash from Ops. - Capital IQ [CY 2015] (€EURmm, Historical rate)",
    "Cash from Ops. - Capital IQ [CY 2016] (€EURmm, Historical rate)",
    "Cash from Ops. - Capital IQ [CY 2017] (€EURmm, Historical rate)",
    "Cash from Ops. - Capital IQ [CY 2018] (€EURmm, Historical rate)",
]

plt.figure(figsize=(12, 8))  # Adjust the figure size if needed

df_pp[selected_columns].boxplot()
plt.title("Cash from Ops Boxplots")
# The y axis shows cash from operations, not market capitalisation
plt.ylabel("Cash from Ops. (€EURmm)")
plt.xticks(rotation=45)  # Rotate x-axis labels for better readability
plt.show()

In [None]:
# Total revenue columns, one per calendar year from 2014 to 2018
selected_columns = [
    "Total Revenue - Capital IQ [CY 2014] (€EURmm, Historical rate)",
    "Total Revenue - Capital IQ [CY 2015] (€EURmm, Historical rate)",
    "Total Revenue - Capital IQ [CY 2016] (€EURmm, Historical rate)",
    "Total Revenue - Capital IQ [CY 2017] (€EURmm, Historical rate)",
    "Total Revenue - Capital IQ [CY 2018] (€EURmm, Historical rate)",
]

plt.figure(figsize=(12, 8))  # Adjust the figure size if needed

df_pp[selected_columns].boxplot()
plt.title("Total Revenue Boxplots")
# The y axis shows total revenue, not market capitalisation
plt.ylabel("Total Revenue (€EURmm)")
plt.xticks(rotation=45)  # Rotate x-axis labels for better readability
plt.show()

In [None]:
# "Employee development" columns for 2015 to 2018
selected_columns = [
    "Employee development 2015",
    "Employee development 2016",
    "Employee development 2017",
    "Employee development 2018",
]

plt.figure(figsize=(12, 8))  # Adjust the figure size if needed

df_pp[selected_columns].boxplot()
plt.title("Employee development Boxplots")
# Label the y axis with the plotted metric; it is not a market
# capitalisation in EUR. No unit is stated because the data does not name one.
plt.ylabel("Employee development")
plt.xticks(rotation=45)  # Rotate x-axis labels for better readability
plt.show()

In [None]:
# "Migrating work experience" columns for 2014 to 2017
selected_columns = [
    "Migrating work experience 2014",
    "Migrating work experience 2015",
    "Migrating work experience 2016",
    "Migrating work experience 2017",
]

plt.figure(figsize=(12, 8))  # Adjust the figure size if needed

df_pp[selected_columns].boxplot()
plt.title("Migrating work experience Boxplots")
# Label the y axis with the plotted metric; it is not a market
# capitalisation in EUR. No unit is stated because the data does not name one.
plt.ylabel("Migrating work experience")
plt.xticks(rotation=45)  # Rotate x-axis labels for better readability
plt.show()

In [None]:
# "New joining work experience" columns for 2015 to 2018
selected_columns = [
    "New joining work experience 2015",
    "New joining work experience 2016",
    "New joining work experience 2017",
    "New joining work experience 2018",
]

plt.figure(figsize=(12, 8))  # Adjust the figure size if needed

df_pp[selected_columns].boxplot()
plt.title("New joining work experience Boxplots")
# Label the y axis with the plotted metric; it is not a market
# capitalisation in EUR. No unit is stated because the data does not name one.
plt.ylabel("New joining work experience")
plt.xticks(rotation=45)  # Rotate x-axis labels for better readability
plt.show()

In [None]:
# "Fluctuation rate" columns for 2014 to 2018
selected_columns = [
    "Fluctuation rate 2014",
    "Fluctuation rate 2015",
    "Fluctuation rate 2016",
    "Fluctuation rate 2017",
    "Fluctuation rate 2018",
]

plt.figure(figsize=(12, 8))  # Adjust the figure size if needed

df_pp[selected_columns].boxplot()
plt.title("Fluctuation rate Boxplots")
# Label the y axis with the plotted metric; it is not a market
# capitalisation in EUR. No unit is stated because the data does not name one.
plt.ylabel("Fluctuation rate")
plt.xticks(rotation=45)  # Rotate x-axis labels for better readability
plt.show()

From a statistical point of view, I would use at least a 98 % quantile. From a professional point of view, most data outliers make sense...

In [None]:
# Single column: people who held more than one / a different position
selected_columns = [
    "More than once/different position",
]

plt.figure(figsize=(12, 8))  # Adjust the figure size if needed

df_pp[selected_columns].boxplot()
# Spelling of the displayed title corrected ("Serveral" -> "Several")
plt.title("Several positions Boxplots")
plt.ylabel("Number of people who worked there in more than one position")
plt.xticks(rotation=45)  # Rotate x-axis labels for better readability
plt.show()

### 3.3.2 Data allocation with respect to the target variable

In [None]:
# Class frequencies of both target columns
downgrade_2017_counts = df_pp['Downgrade 2017'].value_counts()
downgrade_2018_counts = df_pp['Downgrade 2018'].value_counts()

# Print each heading on its own line, followed by the counts
print("Downgrade 2017:", downgrade_2017_counts, sep="\n")
print("\nDowngrade 2018:", downgrade_2018_counts, sep="\n")

In [None]:
import matplotlib.pyplot as plt

# Reuse the class counts of "Downgrade 2017" and "Downgrade 2018"
value_counts_2017 = downgrade_2017_counts
value_counts_2018 = downgrade_2018_counts

# Slice order is fixed: class 1 first, then class 0; a class that does not
# occur is drawn with a count of 0
labels_2017, labels_2018 = ['1', '0'], ['1', '0']
explode_2017, explode_2018 = [0.1, 0], [0.1, 0]  # pull the "1" slice out
values_2017 = [value_counts_2017.get(flag, 0) for flag in (1, 0)]
values_2018 = [value_counts_2018.get(flag, 0) for flag in (1, 0)]

# One figure with two pie charts side by side
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))

for pie_axis, pie_values, pie_labels, pie_explode, pie_title in (
    (ax1, values_2017, labels_2017, explode_2017, 'Distribution of Downgrade 2017'),
    (ax2, values_2018, labels_2018, explode_2018, 'Distribution of Downgrade 2018'),
):
    pie_axis.pie(pie_values, labels=pie_labels, autopct='%1.2f%%', explode=pie_explode)
    pie_axis.set_aspect('equal')  # equal aspect ratio keeps the pie circular
    pie_axis.set_title(pie_title)

# Display the pie charts
plt.show()

After the split into test and train data, the train data should be oversampled using the SMOTE technique.

### 3.3.3 Checking distribution in dataset

In [None]:
# How often each country occurs in the 'Land' column
country_counts = df_pp['Land'].value_counts()

# Pie chart of the distribution, one slice per country
plt.figure(figsize=(8, 8))
plt.pie(
    country_counts,
    labels=country_counts.index,
    autopct='%1.1f%%',
)
plt.title('Distribution of Companies by Country')
plt.show()

Most companies are from the USA. Second biggest group is GB, followed by Switzerland.

In [None]:
# How often each industry occurs in the 'Industry' column
# (the variable names keep their historical "country" prefix)
country_counts = df_pp['Industry'].value_counts()

# Table with one row per industry and its count
country_distribution = pd.DataFrame(
    {
        'Industry': country_counts.index,
        'Count': country_counts.values,
    }
)

# Share of each industry relative to the number of rows in df_pp
total_countries = len(df_pp['Industry'])
country_distribution['Percentage'] = country_distribution['Count'].div(total_countries).mul(100)

# Largest industries first
country_distribution = country_distribution.sort_values(by='Count', ascending=False)

# Display the tabular view of the distribution
print(country_distribution)

Insurance companies should not be included and need to be removed. The focus of the research is on corporate companies.

In [None]:
def calculate_percentage(row, year):
    """Return the LinkedIn head-count of ``row`` as a percentage of its Capital IQ head-count.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) holding the columns
            "Total Employees - Capital IQ [CY <year>]" and "Number of employees <year>".
        year: Calendar year (integer) selecting which pair of columns to read.

    Returns:
        ``employees / total_employees * 100``, or ``None`` when the Capital IQ
        head-count is missing (NaN) or zero.
    """
    headcount_capital_iq = row[f"Total Employees - Capital IQ [CY {year}]"]
    headcount_linkedin = row[f"Number of employees {year}"]

    # Without a usable denominator there is no meaningful share.
    if pd.isna(headcount_capital_iq) or headcount_capital_iq == 0:
        return None

    return (headcount_linkedin / headcount_capital_iq) * 100

# Calendar years for which the LinkedIn share is computed
years = [2014, 2015, 2016, 2017, 2018]
share_columns = [f"Percentage of employees on Linkedin {year}" for year in years]

# Add one share column per year to df_pp (row-wise via calculate_percentage)
for year, col_name in zip(years, share_columns):
    df_pp[col_name] = df_pp.apply(calculate_percentage, axis=1, args=(year,))

# Take an explicit copy: the following cells add "Percentage Range ..." columns to
# output_df, and assigning into a bare slice of df_pp triggers SettingWithCopyWarning.
output_df = df_pp[["Company Name"] + share_columns].copy()
output_df


In [None]:
# Bucket edges in percent; the open-ended last bucket collects 20% and above.
bins = [0, 5, 10, 15, 20, float("inf")]
labels = ['<5%', '5-10%', '10-15%', '15-20%', '20%+']

# Assign every company to a bucket for 2017.
# right=False makes each bucket include its lower edge and exclude its upper edge.
linkedin_share_2017 = output_df['Percentage of employees on Linkedin 2017']
output_df['Percentage Range 2017'] = pd.cut(
    linkedin_share_2017,
    bins=bins,
    labels=labels,
    right=False,
)

# Number of companies per bucket
percentage_counts = output_df['Percentage Range 2017'].value_counts()
percentage_counts

In [None]:
# Bucket edges in percent; the open-ended last bucket collects 20% and above.
bins = [0, 5, 10, 15, 20, float("inf")]
labels = ['<5%', '5-10%', '10-15%', '15-20%', '20%+']

# Assign every company to a bucket for 2018.
# right=False makes each bucket include its lower edge and exclude its upper edge.
linkedin_share_2018 = output_df['Percentage of employees on Linkedin 2018']
output_df['Percentage Range 2018'] = pd.cut(
    linkedin_share_2018,
    bins=bins,
    labels=labels,
    right=False,
)

# Number of companies per bucket
percentage_counts = output_df['Percentage Range 2018'].value_counts()
percentage_counts

In [None]:
# Rows flagged as downgraded in 2017 and in 2018, respectively
filtered_rows_2017 = df_pp[df_pp['Downgrade 2017'] == 1]
filtered_rows_2018 = df_pp[df_pp['Downgrade 2018'] == 1]

# Stack both selections; a company downgraded in both years appears twice,
# which is harmless because the names are only used for a membership test.
filtered_rows_combined = pd.concat([filtered_rows_2017, filtered_rows_2018])
company_names = filtered_rows_combined['Company Name']

# Restrict the LinkedIn-share table to the downgraded companies.
# .copy() makes result_df an independent frame: a later cell adds a column to it,
# which would raise SettingWithCopyWarning on a bare boolean-mask slice.
result_df = output_df[output_df['Company Name'].isin(company_names)].copy()
result_df.head(10)

In [None]:
# Bucket edges in percent; the open-ended last bucket collects 20% and above.
bins = [0, 5, 10, 15, 20, float("inf")]
labels = ['<5%', '5-10%', '10-15%', '15-20%', '20%+']

# Bucket the 2018 LinkedIn share of the companies in result_df.
# right=False makes each bucket include its lower edge and exclude its upper edge.
downgraded_share_2018 = result_df['Percentage of employees on Linkedin 2018']
result_df['Percentage Range 2018'] = pd.cut(
    downgraded_share_2018,
    bins=bins,
    labels=labels,
    right=False,
)

# Number of companies per bucket
percentage_counts = result_df['Percentage Range 2018'].value_counts()
percentage_counts

In [None]:
# Remove the per-year LinkedIn-share columns from df_pp again.
columns_to_remove = [
    f'Percentage of employees on Linkedin {share_year}'
    for share_year in (2018, 2014, 2015, 2016, 2017)
]
df_pp = df_pp.drop(columns=columns_to_remove)
df_pp.head(2)

Downgraded companies show a rather low percentage of employees on LinkedIn. The distribution does not change over the years.

The majority of the companies have a percentage below 5%. Take into account that there are no employee numbers for around 60-70 companies, which results in 0. The reduction can be explained by the general reduction in the data in 2018. The LinkedIn dataset was probably retrieved during 2018.

## 3.4 Final data cleansing

In [None]:
# Work on a copy so that the cleansing steps applied to df_pp2 leave df_pp untouched.
df_pp2 = df_pp.copy()

### 3.4.1 Cleaning data outliers

This feature counts the number of people who have worked in different positions within the company. Even though a value of 120,000 might be realistic for larger firms, values above the 98% quantile are removed here.

In [None]:
# Trim the feature at its 98th percentile: rows above the cut-off are discarded.
# Rows where the feature is NaN fail the comparison and are discarded as well.
position_changes = df_pp2['More than once/different position']
quantile_98 = position_changes.quantile(0.98)
df_pp2 = df_pp2.loc[position_changes <= quantile_98]

In [None]:
# Feature(s) to show in the box plot
selected_columns = ["More than once/different position"]

# Open a 12x8 figure first so that the box plot is drawn at that size.
plt.figure(figsize=(12, 8))
df_pp2.loc[:, selected_columns].boxplot()

# Axis decoration; the tick labels are tilted for readability.
plt.ylabel("Number of people who worked there in more than one position")
plt.title("Serveral positions Boxplots")
plt.xticks(rotation=45)
plt.show()

### 3.4.2 Removing Industry Insurance

In [None]:
# Keep every row whose industry is not 'Insurance' (rows without an industry are kept).
is_not_insurance = df_pp2['Industry'].ne('Insurance')
df_pp2 = df_pp2.loc[is_not_insurance]

In [None]:
# Tally the companies per industry in the cleaned frame.
# NOTE: the variable names still say "country" for historical reasons; the data is the 'Industry' column.
country_counts = df_pp2['Industry'].value_counts()

# Denominator for the share: every row of the frame, including rows without an industry.
total_countries = len(df_pp2['Industry'])

# Tabulate count and percentage share per industry, largest industry first.
country_distribution = (
    pd.DataFrame({'Industry': country_counts.index, 'Count': country_counts.values})
    .assign(Percentage=lambda frame: (frame['Count'] / total_countries) * 100)
    .sort_values(by='Count', ascending=False)
)

print(country_distribution)

Removal of Insurance successful. Checking the effect on the target variable:

In [None]:
# Class balance of the target after cleansing.
# The counts are taken from df_pp2 (outliers and Insurance removed); counting on
# df_pp would show the distribution *before* the removal and hide its effect.
downgrade_2017_counts = df_pp2['Downgrade 2017'].value_counts()
print("Downgrade 2017:")
print(downgrade_2017_counts)

downgrade_2018_counts = df_pp2['Downgrade 2018'].value_counts()
print("\nDowngrade 2018:")
print(downgrade_2018_counts)

### 3.4.3 Converting ratings into categorical values

The ratings need to be converted into categorical variables to be useful in further analysis. Integer encoding can be used: the rating expresses a score, and this ordering is reflected in the ascending integer values.

First, it is checked whether there are rows without a rating in the relevant years from 2014 to 2018. The missing ratings are researched and entered manually:

In [None]:
# A company counts as unrated when its rating is 0 in every year from 2014 to 2018.
rating_columns = [f'Rating {rating_year}' for rating_year in range(2014, 2019)]
is_unrated = df_pp2[rating_columns].eq(0).all(axis=1)
filtered_companies = df_pp2[is_unrated]

# Names of the unrated companies, as a plain list for printing
company_names = filtered_companies["Company Name"].tolist()

print("Company Names with all Rating values from 2014 to 2018 as 0:")
print(company_names)

In [None]:
# Manually researched ratings, keyed by the exact value of the 'Company Name' column.
# Each company gets ONE rating that is written to all five years 2014-2018.
# NOTE(review): some keys contain double spaces (e.g. 'Organon  Co'), presumably left over
# from stripped special characters -- they must match the data exactly, do not "tidy" them.
rating_mapping = {
    'AMETEK Inc': 'BBB+',
    'ASM International NV': 'BB+',
    'Atlas Copco AB': 'A+',
    'STMicroelectronics NV': 'BBB',
    'Welltower Inc': 'BBB+',
    'ABB Ltd': 'A-',
    'Adecco Group AG': 'BBB+',
    'adidas AG': 'A-',
    'Advance Auto Parts Inc': 'BBB-',
    'Advanced Micro Devices Inc': 'A-',
    'Alcon Inc': 'BBB',
    'Alliant Energy Corporation': 'A-',
    'American Airlines Group Inc': 'B-',
    'American Electric Power Company Inc': 'A-',
    'Anglo American plc': 'BBB+',
    'APA Corporation': 'BBB',
    'Associated British Foods plc': 'A',
    'BioRad Laboratories Inc': 'BBB',
    'Brenntag SE': 'BBB',
    'British American Tobacco plc': 'BBB+',
    'BT Group plc': 'BBB',
    'Bunge Limited': 'BBB',
    'Bunzl plc': 'BBB+',
    'Capgemini SE': 'BBB',
    'Carnival Corporation  plc': 'BBB-',
    'CDW Corporation': 'BBB-',
    'Celanese Corporation': 'BBB-',
    'CF Industries Holdings Inc': 'BBB',
    'Charter Communications Inc': 'BB+',
    'Cintas Corporation': 'A-',
    'CNH Industrial NV': 'BBB',
    'ConocoPhillips': 'A-',
    'Consolidated Edison Inc': 'A-',
    'Constellation Energy Corporation': 'BBB-',
    'Corteva Inc': 'A-',
    'CoStar Group Inc': 'BB+',
    'DaVita Inc': 'BB',
    'Devon Energy Corporation': 'BBB',
    'Diageo plc': 'A-',
    'Dow Inc': 'BBB',
    'Eaton Corporation plc': 'A-',
    'EQT Corporation': 'BBB-',
    'Essex Property Trust Inc': 'BBB+',
    'Experian plc': 'A-',
    'Ferrovial SA': 'BBB',
    'Fortinet Inc': 'BBB+',
    'GE HealthCare Technologies Inc': 'BBB',
    'Genuine Parts Company': 'BBB',
    'Givaudan SA': 'A-',
    'Glencore plc': 'BBB+',
    'HCA Healthcare Inc': 'BBB',
    'Hilton Worldwide Holdings Inc': 'BB+',
    'Hormel Foods Corporation': 'A-',
    'Host Hotels  Resorts Inc': 'BBB-',
    'HP Inc': 'BBB',
    'Huntington Ingalls Industries Inc': 'BBB-',
    'Iberdrola SA': 'BBB+',
    'IDEX Corporation': 'BBB',
    'Imperial Brands PLC': 'BBB',
    'Ingersoll Rand Inc': 'BBB-',
    'Kerry Group plc': 'BBB+',
    'Las Vegas Sands Corp': 'BB+',
    'LyondellBasell Industries NV': 'BBB',
    'Marks and Spencer Group plc': 'BBB-',
    'Medtronic plc': 'A',
    'Mohawk Industries Inc': 'BBB+',
    'Mondi plc': 'BBB+',
    'Motorola Solutions Inc': 'BBB-',
    'News Corporation': 'BB+',
    'NextEra Energy Inc': 'A-',
    'Novo Nordisk AS': 'AA-',
    'OMV Aktiengesellschaft': 'AA+',
    'Organon  Co': 'BB',
    'Pentair plc': 'BBB-',
    'Pinnacle West Capital Corporation': 'BBB+',
    'Pioneer Natural Resources Company': 'BBB',
    'Porsche Automobil Holding SE': 'BBB+',
    'PPL Corporation': 'A-',
    'PTC Inc': 'BB+',
    'Qorvo Inc': 'BBB-',
    'Quanta Services Inc': 'BBB-',
    'Regency Centers Corporation': 'BBB+',
    'Regeneron Pharmaceuticals Inc': 'BBB+',
    'Rio Tinto Group': 'A',
    'Roche Holding AG': 'AA',
    'RollsRoyce Holdings plc': 'BBB',
    'Safran SA': 'A-',
    'Schlumberger Limited': 'A',
    'ServiceNow Inc': 'A-',
    'Severn Trent PLC': 'BBB',
    'Skyworks Solutions Inc': 'BBB-',
    'Smurfit Kappa Group Plc': 'BBB-',
    'TakeTwo Interactive Software Inc': 'BBB',
    'Targa Resources Corp': 'BBB-',
    'TE Connectivity Ltd': 'A-',
    'Teledyne Technologies Incorporated': 'BBB',
    'The Kraft Heinz Company': 'BBB',
    'The Sage Group plc': 'BBB+',
    'Thermo Fisher Scientific Inc': 'A-',
    'United Rentals Inc': 'BB+',
    'United Utilities Group PLC': 'A-',
    'Universal Music Group NV': 'BBB',
    'UPMKymmene Oyj': 'BBB',
    'Vonovia SE': 'BBB+',
    'WestRock Company': 'BBB',
    'WPP plc': 'BBB',
}

# Look up the researched rating for every row at once (NaN where the company is not listed)
# and write it to all five rating years of exactly those rows.
# A boolean-mask assignment replaces the former iterrows()/.at loop: it avoids mutating the
# frame while iterating over it and cannot touch unrelated rows that share an index label.
researched_rating = df_pp2['Company Name'].map(rating_mapping)
has_researched_rating = researched_rating.notna()
for rating_year in range(2014, 2019):
    df_pp2.loc[has_researched_rating, f'Rating {rating_year}'] = researched_rating[has_researched_rating]

In [None]:
# Print the distinct values found in each rating column from 2013 to 2018.
for year in range(2013, 2019):
    column_name = 'Rating {}'.format(year)
    unique_values = df_pp2[column_name].unique()
    print('Unique values in {}: {}'.format(column_name, unique_values))

In [None]:
# Integer (ordinal) encoding of the ratings.
# The scale is listed from best to worst grade; a grade's code is its 1-based position.
rating_scale = [
    "AA+", "AA", "AA-",
    "A+", "A", "A-",
    "BBB+", "BBB", "BBB-",
    "BB+", "BB", "BB-",
    "B+", "B", "B-",
    "CCC",
]

# Missing ratings -- stored as 0, NaN or None -- are all encoded as -1.
rating_mapping = {0: -1}
rating_mapping.update((grade, code) for code, grade in enumerate(rating_scale, start=1))
for missing_marker in (np.nan, None):
    rating_mapping[missing_marker] = -1

# Encode the rating columns 2013-2019. Values that are not keys of the mapping
# (e.g. a grade missing from the scale) are left unchanged by replace().
for year in range(2013, 2020):
    column_name = f'Rating {year}'
    encoded_ratings = df_pp2[column_name].replace(rating_mapping)
    df_pp2[column_name] = encoded_ratings

# Display the updated DataFrame
df_pp2.head(5)


In [None]:
# After the integer encoding a missing rating is -1 (no longer 0).
# Select the companies that are still unrated in every year from 2014 to 2018.
encoded_rating_columns = [f'Rating {rating_year}' for rating_year in range(2014, 2019)]
count_null_ratings = df_pp2[df_pp2[encoded_rating_columns].eq(-1).all(axis=1)]

# List their names; the message names the value that is actually tested (-1),
# the previous text still claimed the ratings "are 0".
company_names_with_null_ratings = count_null_ratings['Company Name'].tolist()
print("Company Names where all Rating values from 2014 to 2018 are -1 (missing):")
print(company_names_with_null_ratings)


### 3.4.4 Handling empty entries

In [None]:
# Lift the display limits so that the full per-column listing is printed.
# These are global pandas options and stay in effect for the following cells.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Number of missing values per column of df_pp2
null_counts = df_pp2.isnull().sum()

# The message names the frame that is actually inspected (it used to say df_up_merged).
print("Number of null values in each column of df_pp2:")
print(null_counts)

The following adjustments are made: 
- Change <year>: Set "no change"
- Financials: median of the column
- Total Employees <year>: use the value of a following year or the median
- Gross Profit/ Employee 2018: drop

In [None]:
# Financial columns whose gaps are filled with the column median.
# The names follow fixed patterns, so they are generated instead of listed one by one.
eur_suffix = " (€EURmm, Historical rate)"
financial_years = range(2014, 2019)
capital_iq_metrics = [
    "EBITDA",
    "EBIT",
    "Net Income",
    "Total Equity",
    "Total Debt",
    "Total Assets",
    "Net Debt",
    "Gross Profit",
    "Cash from Ops.",
    "Total Revenue",
]

# Market capitalization: the 2017 and 2018 columns carry an extra "[My Setting]" tag.
columns_with_missing_values = [
    f"Market Capitalization {'[My Setting] ' if fin_year >= 2017 else ''}[12/31/{fin_year}]{eur_suffix}"
    for fin_year in financial_years
]
# Capital IQ metrics: one column per metric and year
columns_with_missing_values += [
    f"{metric} - Capital IQ [CY {fin_year}]{eur_suffix}"
    for metric in capital_iq_metrics
    for fin_year in financial_years
]
# Ratio columns that only exist for 2018
columns_with_missing_values += [
    "Equity ratio 2018",
    "Debt ratio (in Prozent) 2018",
    "Debt-equity ratio 2018",
    "Return on equity 2018",
    "Return on sales 2018",
]

for col in columns_with_missing_values:
    column_values = df_pp2[col]

    # A cell counts as missing when it is NaN/None or an empty string.
    is_missing = column_values.isnull() | (column_values == '')

    # Median of the observed values only, written into the missing cells.
    df_pp2.loc[is_missing, col] = column_values[~is_missing].median()

Filling the Total Employees columns - either with the value of a following year or with the median.

In [None]:
# Fill missing 2014 head-counts from the following years, nearest year first
# (2015, then 2016, 2017, 2018): each pass only fills cells that are still empty.
employees_2014_col = 'Total Employees - Capital IQ [CY 2014]'
for donor_year in (2015, 2016, 2017, 2018):
    donor_col = f'Total Employees - Capital IQ [CY {donor_year}]'
    df_pp2[employees_2014_col] = df_pp2[employees_2014_col].fillna(df_pp2[donor_col])

# Whatever is still missing gets the median of the (partly filled) 2014 column.
median_employees_2014 = df_pp2[employees_2014_col].median()
df_pp2[employees_2014_col] = df_pp2[employees_2014_col].fillna(median_employees_2014)

In [None]:
# Fill missing 2015 head-counts from the following years, nearest year first
# (2016, then 2017, 2018): each pass only fills cells that are still empty.
employees_2015_col = 'Total Employees - Capital IQ [CY 2015]'
for donor_year in (2016, 2017, 2018):
    donor_col = f'Total Employees - Capital IQ [CY {donor_year}]'
    df_pp2[employees_2015_col] = df_pp2[employees_2015_col].fillna(df_pp2[donor_col])

# Whatever is still missing gets the median of the (partly filled) 2015 column.
median_employees_2015 = df_pp2[employees_2015_col].median()
df_pp2[employees_2015_col] = df_pp2[employees_2015_col].fillna(median_employees_2015)


In [None]:
# Fill missing 2016 head-counts from the following years, nearest year first
# (2017, then 2018): each pass only fills cells that are still empty.
employees_2016_col = 'Total Employees - Capital IQ [CY 2016]'
for donor_year in (2017, 2018):
    donor_col = f'Total Employees - Capital IQ [CY {donor_year}]'
    df_pp2[employees_2016_col] = df_pp2[employees_2016_col].fillna(df_pp2[donor_col])

# Whatever is still missing gets the median of the (partly filled) 2016 column.
median_employees_2016 = df_pp2[employees_2016_col].median()
df_pp2[employees_2016_col] = df_pp2[employees_2016_col].fillna(median_employees_2016)


In [None]:
# Fill missing 2017 head-counts with the 2018 value of the same company.
employees_2017_col = 'Total Employees - Capital IQ [CY 2017]'
employees_2018_values = df_pp2['Total Employees - Capital IQ [CY 2018]']
df_pp2[employees_2017_col] = df_pp2[employees_2017_col].fillna(employees_2018_values)

# Whatever is still missing gets the median of the (partly filled) 2017 column.
median_employees_2017 = df_pp2[employees_2017_col].median()
df_pp2[employees_2017_col] = df_pp2[employees_2017_col].fillna(median_employees_2017)


In [None]:
# 2018 has no following year to borrow from, so gaps are filled with the 2017 head-count.
# The fill is skipped when the 2018 column holds no value at all (guard kept from the
# original cell).
employees_2018_col = 'Total Employees - Capital IQ [CY 2018]'
if df_pp2[employees_2018_col].notnull().any():
    # Assign the result back instead of calling fillna(..., inplace=True) on the selected
    # column: that is chained in-place assignment, which is not guaranteed to write through
    # to df_pp2 (pandas warns about it and ignores it under Copy-on-Write).
    df_pp2[employees_2018_col] = df_pp2[employees_2018_col].fillna(
        df_pp2['Total Employees - Capital IQ [CY 2017]']
    )

Dropping Gross Profit / Employee 2018:

In [None]:
# Remove the "Gross Profit/ Employee 2018" column from df_pp2.
df_pp2 = df_pp2.drop(columns=["Gross Profit/ Employee 2018"])

"no change" is set as the value for the missing change indicators:

In [None]:
# Rating-change indicator columns; a missing entry is recorded as "no change".
columns_to_fill_with_no_change = [
    'Change 2013/14',
    'Change 2014/15',
    'Change 2015/16',
    'Change 2016/17',
]

for change_col in columns_to_fill_with_no_change:
    df_pp2[change_col] = df_pp2[change_col].fillna("no change")

Check if all missing fields are eliminated:

In [None]:
# Lift the display limits so that the full per-column listing is printed.
# These are global pandas options and stay in effect for the following cells.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Number of missing values per column of df_pp2 after the imputation steps
null_counts = df_pp2.isnull().sum()

# The message names the frame that is actually inspected (it used to say df_up_merged).
print("Number of null values in each column of df_pp2:")
print(null_counts)

Cleaning of missing values successful.

## 3.5 Transforming into time series

In [None]:
# Work on a copy so that the reshaping steps applied to df_pp3 leave df_pp2 untouched.
df_pp3 = df_pp2.copy()

In [None]:
# Columns removed before the reshaping, grouped by kind.
descriptive_columns = ["Exchange", "Land", "Industry", "Ticker", "Geographic Region"]
change_columns = ["Change 2013/14", "Change 2014/15", "Change 2015/16", "Change 2016/17"]
ratio_2018_columns = [
    "Equity ratio 2018",
    "Debt ratio (in Prozent) 2018",
    "Debt-equity ratio 2018",
    "Return on equity 2018",
    "Return on sales 2018",
]
columns_to_drop = descriptive_columns + change_columns + ratio_2018_columns

df_pp3 = df_pp3.drop(columns=columns_to_drop)

In [None]:
# Build the long-format skeleton: one row per (company, year) combination.
years = [2014, 2015, 2016, 2017, 2018]

# Distinct company names in order of first appearance
unique_companies = df_pp3["Company Name"].unique()

# Create all rows in a single constructor call. The previous version grew the frame with
# DataFrame.append inside a loop, which copies the frame on every iteration and no longer
# exists in pandas >= 2.0.
df_pp3_new = pd.DataFrame(
    [(company, year) for company in unique_companies for year in years],
    columns=["Company Name", "Years"],
)

# Only "Company Name" and "Years" were kept from the former left-merge with df_pp3, so the
# merge is dropped. (It also multiplied the rows of companies listed more than once in
# df_pp3; the skeleton now always holds exactly one row per company and year.)
df_pp3_final = df_pp3_new.copy()

# Store the year as a 'YYYY' string, e.g. '2014'.
df_pp3_final["Years"] = df_pp3_final["Years"].astype(str)
df_pp3_final.head(2)

In [None]:

# Map the verbose Capital IQ export headers to short "<metric> <year>" names.
# The old names follow fixed patterns, so the mapping is generated instead of listed.
eur_suffix = " (€EURmm, Historical rate)"
renamed_years = range(2014, 2019)
capital_iq_metrics = [
    "EBITDA",
    "EBIT",
    "Net Income",
    "Total Equity",
    "Total Debt",
    "Total Assets",
    "Net Debt",
    "Gross Profit",
    "Cash from Ops.",
    "Total Revenue",
]

column_mapping = {}
for renamed_year in renamed_years:
    # Market capitalization: the 2017 and 2018 headers carry an extra "[My Setting]" tag.
    setting_tag = "[My Setting] " if renamed_year >= 2017 else ""
    column_mapping[
        f"Market Capitalization {setting_tag}[12/31/{renamed_year}]{eur_suffix}"
    ] = f"Market Capitalization {renamed_year}"

    # Monetary Capital IQ metrics
    for metric in capital_iq_metrics:
        column_mapping[
            f"{metric} - Capital IQ [CY {renamed_year}]{eur_suffix}"
        ] = f"{metric} {renamed_year}"

    # The head-count header has no currency suffix.
    column_mapping[
        f"Total Employees - Capital IQ [CY {renamed_year}]"
    ] = f"Total Employees {renamed_year}"

# Two features without a year in their name are labelled as 2018.
column_mapping["Average years of service with the company"] = "Average years of service with the company 2018"
column_mapping["More than once/different position"] = "More than once/different position 2018"

# Rename the columns of df_pp3 in place
df_pp3.rename(columns=column_mapping, inplace=True)
df_pp3.head(3)


In [None]:
# Duplicate the 2017 values under a 2018 name (appended as new columns at the end).
# NOTE(review): presumably these features have no 2018 figures of their own -- confirm.
for feature in ("Number of notices", "Migrating work experience"):
    df_pp3[f"{feature} 2018"] = df_pp3[f"{feature} 2017"]

# Duplicate the 2015 values under a 2014 name, placed directly in front of the 2015 column.
# NOTE(review): presumably these features have no 2014 figures of their own -- confirm.
for feature in ("Employee development", "Number of New Joiners", "New joining work experience"):
    column_2015 = f"{feature} 2015"
    position_2015 = df_pp3.columns.get_loc(column_2015)
    df_pp3.insert(position_2015, f"{feature} 2014", df_pp3[column_2015])

# Display the updated df_pp3 DataFrame
df_pp3.head(3)

In [None]:
# Function to look up the value for a given Company Name and Year in a specified column
def lookup_value(company_name, year, column_name):
    """Return the first df_pp3 value of column "<column_name> <year>" for a company.

    Rows are selected by exact match on the "Company Name" column of the
    module-level ``df_pp3``; an IndexError is raised when no row matches.
    """
    is_company = df_pp3["Company Name"] == company_name
    matches = df_pp3.loc[is_company, f"{column_name} {year}"]
    return matches.values[0]

# Base names of the per-year columns in df_pp3 ("<name> <year>") that are
# copied into the matching "<name>" column of df_pp3_final
columns_to_update = ["Rating","Market Capitalization","EBITDA", "EBIT","Net Income","Total Equity","Total Debt","Total Assets","Net Debt","Gross Profit","Total Employees","Cash from Ops.","Total Revenue", "Number of employees", "Employee development", "Number of notices", "Migrating work experience","Number of New Joiners", "New joining work experience", "Fluctuation rate",]

# Iterate over each row in df_pp3_final
for index, row in df_pp3_final.iterrows():
    # Get the Company Name and Year from the current row
    company_name = row["Company Name"]
    year = row["Years"]
    
    # Iterate over each column to update
    for column_name in columns_to_update:
        # Look up the value for the Company Name, Year, and current column
        # (lookup_value filters df_pp3 again on every call)
        value = lookup_value(company_name, year, column_name)
        
        # Assign the value to the appropriate cell in df_pp3_final
        df_pp3_final.at[index, column_name] = value

# Preview the first ten filled rows
df_pp3_final.head(10)


In [None]:
# Add a new column "Downgrade" to df_pp3_final and initialize it with the value 0
df_pp3_final["Downgrade"] = 0

df_pp3_final.head(1)

In [None]:
companies_with_default_1 = df_pp3[df_pp3["Downgrade 2017"] == 1]["Company Name"].unique()
print(companies_with_default_1)

In [None]:
companies_with_default_1 = df_pp3[df_pp3["Downgrade 2018"] == 1]["Company Name"].unique()
print(companies_with_default_1)

In [None]:
# Filter df_pp3_final for the year 2018
df_pp3_final_year = df_pp3_final[df_pp3_final["Years"] == "2018"]

# List of companies to filter
companies_to_filter = ['ATT Inc', 'Bath  Body Works Inc', 'Carrefour SA', 'Centrica plc',
                       'Charles River Laboratories International Inc', 'Duke Energy Corporation',
                       'Edison International', 'Eli Lilly and Company', 'Equifax Inc',
                       'Eversource Energy', 'Ford Motor Company', 'LKQ Corporation',
                       'ParkerHannifin Corporation', 'Publicis Groupe SA', 'QUALCOMM Incorporated',
                       'Renault SA', 'Starbucks Corporation', 'Stryker Corporation',
                       'The Boeing Company', 'The Walt Disney Company', 'CenterPoint Energy Inc','Gilead Sciences Inc', 'Marriott International Inc']

# Filter df_pp3_final_year for the specified companies
df_pp3_final_filtered = df_pp3_final_year[df_pp3_final_year["Company Name"].isin(companies_to_filter)]

# Set "Downgrade" to 1 for the filtered companies
df_pp3_final.loc[df_pp3_final_filtered.index, "Downgrade"] = 1

# Display the updated DataFrame, Check for ATT Inc in 184
df_pp3_final.head(185)

In [None]:
# Count the occurrences of each value in the "Downgrade" column
downgrade_counts = df_pp3_final["Downgrade"].value_counts()

# Print the result
print(downgrade_counts)

In [None]:
# Filter df_pp3_final for the year 2017
df_pp3_final_year = df_pp3_final[df_pp3_final["Years"] == "2017"]

# List of companies to filter
companies_to_filter = ['Akzo Nobel NV', 'Campbell Soup Company',
 'Charles River Laboratories International Inc', 'CVS Health Corporation',
 'Engie SA', 'General Mills Inc', 'Keurig Dr Pepper Inc', 'SSE plc',
 'Telia Company AB',]

# Filter df_pp3_final_year for the specified companies
df_pp3_final_filtered = df_pp3_final_year[df_pp3_final_year["Company Name"].isin(companies_to_filter)]

# Set "Downgrade" to 1 for the filtered companies
df_pp3_final.loc[df_pp3_final_filtered.index, "Downgrade"] = 1

# Display the updated DataFrame
df_pp3_final.head(2)

In [None]:
# Filter df_pp3_final for the year 2016
df_pp3_final_year = df_pp3_final[df_pp3_final["Years"] == "2016"]

# List of companies to filter
companies_to_filter = ['FreeportMcMoRan Inc', 'Ralph Lauren Corporation']

# Filter df_pp3_final_year for the specified companies
df_pp3_final_filtered = df_pp3_final_year[df_pp3_final_year["Company Name"].isin(companies_to_filter)]

# Set "Downgrade" to 1 for the filtered companies
df_pp3_final.loc[df_pp3_final_filtered.index, "Downgrade"] = 1

# Display the updated DataFrame
df_pp3_final.head(2)

In [None]:
# Filter df_pp3_final for the year 2015
df_pp3_final_year = df_pp3_final[df_pp3_final["Years"] == "2015"]

# List of companies to filter
companies_to_filter = ['ConocoPhillips', 'eBay Inc', 'EOG Resources Inc', 'FreeportMcMoRan Inc']

# Filter df_pp3_final_year for the specified companies
df_pp3_final_filtered = df_pp3_final_year[df_pp3_final_year["Company Name"].isin(companies_to_filter)]

# Set "Downgrade" to 1 for the filtered companies
df_pp3_final.loc[df_pp3_final_filtered.index, "Downgrade"] = 1

# Display the updated DataFrame
df_pp3_final.head(2)

In [None]:
# Filter df_pp3_final for the year 2014
df_pp3_final_year = df_pp3_final[df_pp3_final["Years"] == "2014"]

# List of companies to filter
companies_to_filter = ['FreeportMcMoRan Inc']

# Filter df_pp3_final_year for the specified companies
df_pp3_final_filtered = df_pp3_final_year[df_pp3_final_year["Company Name"].isin(companies_to_filter)]

# Set "Downgrade" to 1 for the filtered companies
df_pp3_final.loc[df_pp3_final_filtered.index, "Downgrade"] = 1

# Display the updated DataFrame
df_pp3_final.head(2)

In [None]:
# Select the 2014 rows. "Years" is compared as text so the filter matches
# whether the column holds strings (as the Downgrade cells in this notebook
# assume) or integers; comparing a string column with the int 2014 would
# silently select nothing and skip the overrides below.
df_pp3_final_year = df_pp3_final[df_pp3_final["Years"].astype(str) == "2014"]

# Hard-coded rating overrides for 2014, keyed by company name
# NOTE(review): -1 presumably marks "no rating available" — confirm
new_ratings = {
    'CenterPoint Energy Inc': 6,
    'ConocoPhillips': 5,
    'eBay Inc': -1,
    'EOG Resources Inc': 6,
    'FreeportMcMoRan Inc': 9,
    'Marriott International Inc': 8,
    'Ralph Lauren Corporation': 5,
}

# Write each override into the 2014 row(s) of the respective company
for company, new_rating in new_ratings.items():
    company_indices = df_pp3_final_year[df_pp3_final_year["Company Name"] == company].index
    df_pp3_final.loc[company_indices, "Rating"] = new_rating

In [None]:
# Select the 2015 rows. "Years" is compared as text so the filter matches
# whether the column holds strings (as the Downgrade cells in this notebook
# assume) or integers; comparing a string column with the int 2015 would
# silently select nothing and skip the overrides below.
df_pp3_final_year = df_pp3_final[df_pp3_final["Years"].astype(str) == "2015"]

# Hard-coded rating overrides for 2015, keyed by company name
new_ratings = {
    'CenterPoint Energy Inc': 6,
    'ConocoPhillips': 6,
    'eBay Inc': 5,
    'EOG Resources Inc': 7,
    'FreeportMcMoRan Inc': 11,
    'Marriott International Inc': 8,
    'Ralph Lauren Corporation': 5,
}

# Write each override into the 2015 row(s) of the respective company
for company, new_rating in new_ratings.items():
    company_indices = df_pp3_final_year[df_pp3_final_year["Company Name"] == company].index
    df_pp3_final.loc[company_indices, "Rating"] = new_rating

In [None]:
# Select the 2016 rows. "Years" is compared as text so the filter matches
# whether the column holds strings (as the Downgrade cells in this notebook
# assume) or integers; comparing a string column with the int 2016 would
# silently select nothing and skip the overrides below.
df_pp3_final_year = df_pp3_final[df_pp3_final["Years"].astype(str) == "2016"]

# Hard-coded rating overrides for 2016, keyed by company name
new_ratings = {
    'CenterPoint Energy Inc': 6,
    'ConocoPhillips': 6,
    'EOG Resources Inc': 7,
    'FreeportMcMoRan Inc': 12,
    'Gilead Sciences Inc': 5,
    'Marriott International Inc': 8,
    'Ralph Lauren Corporation': 6,
}

# Write each override into the 2016 row(s) of the respective company
for company, new_rating in new_ratings.items():
    company_indices = df_pp3_final_year[df_pp3_final_year["Company Name"] == company].index
    df_pp3_final.loc[company_indices, "Rating"] = new_rating

In [None]:
# Select the 2017 rows. "Years" is compared as text so the filter matches
# whether the column holds strings (as the Downgrade cells in this notebook
# assume) or integers; comparing a string column with the int 2017 would
# silently select nothing and skip the overrides below.
df_pp3_final_year = df_pp3_final[df_pp3_final["Years"].astype(str) == "2017"]

# Hard-coded rating overrides for 2017, keyed by company name
new_ratings = {
    'CenterPoint Energy Inc': 6,
    'ConocoPhillips': 6,
    'EOG Resources Inc': 7,
    'Eversource Energy': 4,
    'FreeportMcMoRan Inc': 12,
    'Gilead Sciences Inc': 5,
    'Marriott International Inc': 8,
    'Ralph Lauren Corporation': 6,
}

# Write each override into the 2017 row(s) of the respective company
for company, new_rating in new_ratings.items():
    company_indices = df_pp3_final_year[df_pp3_final_year["Company Name"] == company].index
    df_pp3_final.loc[company_indices, "Rating"] = new_rating

In [None]:
# Select the 2018 rows. "Years" is compared as text so the filter matches
# whether the column holds strings (as the Downgrade cells in this notebook
# assume) or integers; comparing a string column with the int 2018 would
# silently select nothing and skip the overrides below.
df_pp3_final_year = df_pp3_final[df_pp3_final["Years"].astype(str) == "2018"]

# Hard-coded rating overrides for 2018, keyed by company name
# NOTE(review): -1 presumably marks "no rating available" — confirm
new_ratings = {
    'CenterPoint Energy Inc': 7,
    'ConocoPhillips': 6,
    'EOG Resources Inc': 7,
    'Eversource Energy': -1,
    'FreeportMcMoRan Inc': 12,
    'Gilead Sciences Inc': -1,
    'Marriott International Inc': 9,
    'Ralph Lauren Corporation': 6,
}

# Write each override into the 2018 row(s) of the respective company
for company, new_rating in new_ratings.items():
    company_indices = df_pp3_final_year[df_pp3_final_year["Company Name"] == company].index
    df_pp3_final.loc[company_indices, "Rating"] = new_rating

In [None]:
# Convert the "Downgrade" column to integer
df_pp3_final["Downgrade"] = df_pp3_final["Downgrade"].astype(int)

In [None]:
# Count the occurrences of each value in the "Downgrade" column
downgrade_counts = df_pp3_final["Downgrade"].value_counts()

# Print the result
print(downgrade_counts)

In [None]:
#Set Index
df_pp3_final.set_index("Years", inplace=True)

In [None]:
# Use factorize and add 1 to the result to obtain a unique ID for each distinct value in the "Company Name" column
df_pp3_final["Company ID"] = pd.factorize(df_pp3_final["Company Name"])[0] + 1
df_pp3_final.head(5)

In [None]:
# Destination for the complete labelled dataset (df_pp3_final).
# The absolute path is machine-specific; adjust it to your environment.
file_path = 'C:\\Users\\chiar\\OneDrive\\Masterthesis\\Modell\\Dataframes\\Downgrade\\downgrade_all.csv'

# Save the DataFrame to a CSV file
# NOTE(review): "Years" was moved into the index by the earlier set_index call and
# index=False leaves the index out of the file — confirm the year is not needed in this CSV.
df_pp3_final.to_csv(file_path, index=False)


In [None]:
# Drop "Company Name"
df_pp3_final.drop(columns=["Company Name"], inplace=True)

## 3.6 Check for multicollinearity

In [None]:
df_pp4 = df_pp3_final.copy()

In [None]:
df_pp4.shape

In [None]:
correlation_matrix = df_pp4.corr()

plt.figure(figsize=(40, 32))
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", xticklabels="auto", yticklabels="auto")
plt.title("Korrelationsmatrix")
plt.show()

In [None]:
# Compute the pairwise correlation matrix of all columns
correlation_matrix = df_pp4.corr()

# Collect [feature 1, feature 2, correlation] for every column pair above the
# diagonal, so that each unordered pair is listed exactly once
column_labels = correlation_matrix.columns
correlation_results = [
    [column_labels[row_pos], column_labels[col_pos], correlation_matrix.iloc[row_pos, col_pos]]
    for row_pos in range(len(column_labels))
    for col_pos in range(row_pos + 1, len(column_labels))
]

# Turn the pair list into a DataFrame
correlation_df = pd.DataFrame(correlation_results, columns=['Feature 1', 'Feature 2', 'Korrelationswert'])

# Show the first ten correlation pairs
correlation_df.head(10)

In [None]:
# Keep the feature pairs whose correlation is above 0.7 but below 1
# NOTE(review): only positive correlations pass this filter; strongly negative
# pairs (below -0.7) are not listed — confirm this is intended.
filtered_correlation_df = correlation_df[
    (correlation_df['Korrelationswert'] > 0.7) & (correlation_df['Korrelationswert'] < 1)
]

# Show the filtered correlation pairs
filtered_correlation_df

In the dataset, there are metrics for multiple years, and it is observed that these metrics exhibit strong correlations among themselves. From a domain perspective, this is understandable and indicates a stable company. It is important to note that the LinkedIn and Finance KPIs do not show a high correlation to each other.

In [None]:
# Keep the pairs in which 'Downgrade' appears as either 'Feature 1' or 'Feature 2'
is_downgrade_pair = (correlation_df['Feature 1'] == 'Downgrade') | (correlation_df['Feature 2'] == 'Downgrade')
correlation_results_filtered = correlation_df[is_downgrade_pair]

# Order the pairs by absolute correlation, strongest first.
# sort_values puts NaN correlations last; the previous argsort()-based
# positional lookup marks NaN entries with -1 and would then pick a wrong row.
abs_order = correlation_results_filtered['Korrelationswert'].abs().sort_values(ascending=False).index
correlation_results_filtered = correlation_results_filtered.loc[abs_order]
correlation_results_filtered

The heat map visualizes the pairwise (one-to-one) correlation of the variables. Since no correlation is disproportionately high, the variance inflation factor (VIF) is used to check the interaction between several variables.

In [None]:
# Columns whose multicollinearity is checked via the variance inflation factor (VIF)
variables = df_pp4[['Rating', 'Market Capitalization', 'EBITDA', 'EBIT', 'Net Income',
       'Total Equity', 'Total Debt', 'Total Assets', 'Net Debt',
       'Gross Profit', 'Total Employees', 'Cash from Ops.', 'Total Revenue',
       'Number of employees', 'Employee development', 'Number of notices',
       'Migrating work experience', 'Number of New Joiners',
       'New joining work experience', 'Fluctuation rate','Company ID']]
vif = pd.DataFrame()
# One VIF per column of `variables`, computed on the raw value matrix
# NOTE(review): no constant/intercept column is added to the matrix and the
# factorized 'Company ID' is treated like a numeric feature — confirm both are intended.
vif['VIF'] = [variance_inflation_factor(variables.values, i) for i in range(variables.shape[1])]
vif['Features'] = variables.columns

In [None]:
vif

Features with a high VIF among the financial KPIs and the LinkedIn features should be removed.

In [None]:
df_pp4 = df_pp4.drop(['Number of employees', 'EBITDA', 'Total Debt', 'Cash from Ops.'],axis = 1)

In [None]:
# Columns remaining after the drop above; their VIF is recomputed
variables = df_pp4[['Rating', 'Market Capitalization', 'EBIT', 'Net Income',
       'Total Equity', 'Total Assets', 'Net Debt',
       'Gross Profit', 'Total Employees', 'Total Revenue',
       'Employee development', 'Number of notices',
       'Migrating work experience', 'Number of New Joiners',
       'New joining work experience', 'Fluctuation rate','Company ID']]
vif = pd.DataFrame()
# One VIF per column of `variables`, computed on the raw value matrix
# NOTE(review): no constant/intercept column is added to the matrix and the
# factorized 'Company ID' is treated like a numeric feature — confirm both are intended.
vif['VIF'] = [variance_inflation_factor(variables.values, i) for i in range(variables.shape[1])]
vif['Features'] = variables.columns

In [None]:
vif

The VIF of New joining work experience and Gross Profit is accepted, since it is close to 10.

# 4. Modeling 

In the following, the required data frames are first formed on the basis of the cleaned data (4.1). Then the ML models are applied first to the financial ratios (4.2), then to the LinkedIn features (4.3) and finally to the combined data (4.4).

In [None]:
dfm = df_pp4.copy()

In [None]:
dfm.shape

## 4.1 Building dataframes

Create the required data frames:
- df_financials
- df_linkedin
- df_com

In [None]:
# getting column names
column_names = ["{}".format(col) for col in dfm.columns]
print(column_names)

In [None]:
#Columns for financials
selected_columns = ['Market Capitalization', 'EBIT', 'Net Income', 'Total Equity', 'Total Assets', 'Net Debt', 'Gross Profit', 'Total Employees', 'Total Revenue', 'Downgrade',]
# Create new dataframe
df_financials = dfm[selected_columns]
df_financials.head(4)

In [None]:
# Destination for the financial-metrics dataset (df_financials).
# The absolute path is machine-specific; adjust it to your environment.
file_path = 'C:\\Users\\chiar\\OneDrive\\Masterthesis\\Modell\\Dataframes\\Downgrade\\Financials\\downgrade_financials.csv'

# Save the DataFrame to a CSV file
# NOTE(review): index=False leaves the index (the year) out of the file — confirm this is intended.
df_financials.to_csv(file_path, index=False)


In [None]:
#Columns for linkedin
selected_columns = ['Employee development', 'Number of notices', 'Migrating work experience', 'Number of New Joiners', 'New joining work experience', 'Fluctuation rate', 'Downgrade',]
# Create new dataframe
df_linkedin = dfm[selected_columns]
df_linkedin.head(4)

In [None]:
# Destination for the LinkedIn-features dataset (df_linkedin).
# The absolute path is machine-specific; adjust it to your environment.
file_path = 'C:\\Users\\chiar\\OneDrive\\Masterthesis\\Modell\\Dataframes\\Downgrade\\LinkedIn\\downgrade_LinkedIn.csv'

# Save the DataFrame to a CSV file
# NOTE(review): index=False leaves the index (the year) out of the file — confirm this is intended.
df_linkedin.to_csv(file_path, index=False)


In [None]:
df_com = dfm.copy()

In [None]:
# NOTE(review): this drops "Company ID" from dfm only; df_com was copied from dfm
# in the previous cell and keeps the column — confirm which frame was meant.
dfm.drop("Company ID", axis=1, inplace=True)

In [None]:
# Destination for the combined dataset (df_com).
# The absolute path is machine-specific; adjust it to your environment.
file_path = 'C:\\Users\\chiar\\OneDrive\\Masterthesis\\Modell\\Dataframes\\Downgrade\\Com\\downgrade_combined.csv'

# Save the DataFrame to a CSV file
# NOTE(review): index=False leaves the index (the year) out of the file — confirm this is intended.
df_com.to_csv(file_path, index=False)


## 4.2 Running models on df_financials

### 4.2.1 Splitting between train and test data

In [None]:
# The index holds the year; cast it to int so it can be compared numerically
df_financials.index = df_financials.index.astype(int)

# Financial predictor columns
features = ['Market Capitalization', 'EBIT', 'Net Income', 'Total Equity', 'Total Assets', 'Net Debt', 'Gross Profit', 'Total Employees', 'Total Revenue',]

# Time-based split (not a random percentage split):
# all years before 2018 form the training set, 2018 is the held-out test year
train_df = df_financials.loc[df_financials.index < 2018]
test_df = df_financials.loc[df_financials.index == 2018]

# Separate predictors and the "Downgrade" target for both partitions
X_train_pre, y_train_pre = train_df[features], train_df["Downgrade"]
X_test, y_test = test_df[features], test_df["Downgrade"]

In [None]:
X_train_pre.shape

In [None]:
y_train_pre.shape

### 4.2.2 Generating synthetic data of train data

In [None]:
# Create an ADASYN oversampler with a fixed seed
adasyn = ADASYN(random_state=42)

# Apply ADASYN to the training data to generate synthetic samples
X_train_adasyn, y_train_adasyn = adasyn.fit_resample(X_train_pre, y_train_pre)

# Wrap the resampled output in pandas objects again, reusing the original column names and target name
X_train = pd.DataFrame(X_train_adasyn, columns=X_train_pre.columns)
y_train = pd.Series(y_train_adasyn, name=y_train_pre.name)


In [None]:
# Class balance of the training target before and after oversampling.
# The sampler applied in this notebook is ADASYN, so the printed labels name it
# (they previously said "SMOTE").
temp1 = pd.DataFrame(y_train_pre)
temp2 = pd.DataFrame(y_train)

print('Before ADASYN')
print(temp1['Downgrade'].value_counts())
print('After ADASYN')
print(temp2['Downgrade'].value_counts())

In [None]:
temp3 = pd.DataFrame(y_test)

print('Check for test data')
print(temp3['Downgrade'].value_counts())

Oversampling of the training data was successful. The test data is still unbalanced.

In [None]:
X_test.to_csv(r'C:\Users\chiar\OneDrive\Masterthesis\Modell\Dataframes\Downgrade\Financials\x_test_financials.csv', index=False)
X_train.to_csv(r'C:\Users\chiar\OneDrive\Masterthesis\Modell\Dataframes\Downgrade\Financials\x_train_financials.csv', index=False)
y_test.to_csv(r'C:\Users\chiar\OneDrive\Masterthesis\Modell\Dataframes\Downgrade\Financials\y_test_financials.csv', index=False)
y_train.to_csv(r'C:\Users\chiar\OneDrive\Masterthesis\Modell\Dataframes\Downgrade\Financials\y_train_financials.csv', index=False)


### 4.2.4  Logistic Regression

In [None]:
# Fit a logistic regression on the oversampled training data
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train, y_train)
# NOTE(review): this filter is installed after fit() has already run, so it only
# silences warnings raised by later code — confirm this is intended.
warnings.filterwarnings("ignore")

In [None]:
y_pred = classifier.predict(X_test)

In [None]:
# Here the X-target variable is compared with the predicted values
cm = confusion_matrix(y_test, y_pred)
 
print ("Confusion Matrix : \n", cm)

In [None]:
#Visualization
plot_confusion_matrix(classifier,X_test,y_test,cmap='Blues')
plt.grid(False)

In [None]:
# Unpack the 2x2 confusion matrix and derive recall and precision of the positive class
tn, fp, fn, tp = cm.ravel()
recall = tp / (tp + fn)
precision = tp / (fp + tp)

# Report the four cell counts followed by the two derived rates
for prefix, figure in (
    ("True Negatives: ", tn),
    ("False Positives: ", fp),
    ("False Negatives: ", fn),
    ("True Positives: ", tp),
    ("Recall: ", recall),
    ("Precision: ", precision),
):
    print(prefix + str(figure))

In [None]:
from sklearn.metrics import accuracy_score
acc = accuracy_score(y_test, y_pred)
print ("Accuracy : ", acc)

In [None]:
print(classification_report(y_test,y_pred))

In [None]:
# Calculate the predicted probabilities for the positive class (class 1)
y_pred_prob = classifier.predict_proba(X_test)[:, 1]

# Calculate the AUC score
auc_score = roc_auc_score(y_test, y_pred_prob)
print("AUC Score:", auc_score)

For verification, we check how the target variable of the training data is predicted. Therefore, the y_train is predicted using logistic regression and using the properties (x_train).

In [None]:
y_train_pred = classifier.predict(X_train)

In [None]:
# Comparison and results check 
print(classification_report(y_train,y_train_pred))

In [None]:
Precisions_financials = {
    "Logistic Regression": [0.04, 0.09, 0.80, 0.45],}
# precision, recall, acc, auc

### 4.2.5 Decision Tree

In [None]:
# Fit a decision tree on the oversampled training data.
# random_state is fixed, as for LogisticRegression and ADASYN in this notebook,
# so the tree and the metrics derived from it are reproducible across runs.
tree = DecisionTreeClassifier(random_state=0)
tree.fit(X_train, y_train)

In [None]:
y_pred_tree = tree.predict(X_test)

In [None]:
plot_confusion_matrix(tree,X_test,y_test,cmap='Blues')
plt.grid(False)

In [None]:
from sklearn.metrics import accuracy_score
acc = accuracy_score(y_test, y_pred_tree)
print ("Accuracy : ", acc)

In [None]:
print(classification_report(y_test, y_pred_tree))

In [None]:
# Calculate the predicted probabilities for the positive class (class 1)
y_pred_prob_tree = tree.predict_proba(X_test)[:, 1]

# Calculate the AUC score for the Decision Tree classifier
auc_score_tree = roc_auc_score(y_test, y_pred_prob_tree)
print("AUC Score for Decision Tree:", auc_score_tree)


In [None]:
y_train_pred_tree = tree.predict(X_train)

In [None]:
plot_confusion_matrix(tree,X_train,y_train, cmap='Blues')
plt.grid(False)

In [None]:
print(classification_report(y_train, y_train_pred_tree))

In [None]:
Precisions_financials.update({
    "Decision Tree": [0.16, 0.13, 0.89, 0.54]
})
# precision, recall, acc, auc

### 4.2.6 Random Forest

In [None]:
# Fit one random forest per maximum tree depth and print train/test reports.
# random_state is fixed, as for LogisticRegression and ADASYN in this notebook,
# so reruns build the same forests.
# After the loop, `rf` is the forest fitted with the last depth in the list (20).
tree_depth = [5, 10, 20]
for i in tree_depth:
    rf = RandomForestClassifier(max_depth=i, random_state=0)
    rf.fit(X_train, y_train)
    print('Max tree depth: ', i)
    print('Train results: ', classification_report(y_train, rf.predict(X_train)))
    print('Test results: ',classification_report(y_test, rf.predict(X_test)))

In [None]:
feature_scores = pd.Series(rf.feature_importances_, index=X_train.columns).sort_values(ascending=False)
feature_scores

In [None]:
# Calculate the predicted probabilities for the positive class (class 1)
y_pred_prob_rf = rf.predict_proba(X_test)[:, 1]

# Calculate the AUC score for the RandomForestClassifier
auc_score_rf = roc_auc_score(y_test, y_pred_prob_rf)
print('AUC Score for Random Forest:', auc_score_rf)

In [None]:
Precisions_financials.update({
    "Random Forest": [0.18, 0.09, 0.91, 0.47],
})

### 4.2.7 XGBoost 

In [None]:
# Train an XGBoost classifier with default hyperparameters
# NOTE(review): the variable name `xgb` rebinds the `import xgboost as xgb` module
# alias from the top of the file — confirm the module alias is not needed afterwards.
xgb = XGBClassifier()
xgb.fit(X_train, y_train)

In [None]:
importance = plot_importance(xgb, height=0.9, max_num_features=10)
plt.show()


In [None]:
print('Train results: ', classification_report(y_train, xgb.predict(X_train)))
print('Test results: ',classification_report(y_test, xgb.predict(X_test)))

In [None]:
# Calculate the predicted probabilities for the positive class (class 1)
y_pred_prob_xgb = xgb.predict_proba(X_test)[:, 1]

# Calculate the AUC score for the XGBoost classifier
auc_score_xgb = roc_auc_score(y_test, y_pred_prob_xgb)
print('AUC Score for XGBoost:', auc_score_xgb)

In [None]:
Precisions_financials.update({
    "XGBoost": [0.18, 0.09, 0.91, 0.46],
})

## 4.3 Running models on df_linkedin

### 4.3.1 Splitting between train and test data

In [None]:
# The index holds the year; cast it to int so it can be compared numerically
df_linkedin.index = df_linkedin.index.astype(int)

# LinkedIn predictor columns
features = ['Employee development', 'Number of notices', 'Migrating work experience', 'Number of New Joiners', 'New joining work experience', 'Fluctuation rate',]

# Time-based split (not a random percentage split):
# all years before 2018 form the training set, 2018 is the held-out test year
train_df = df_linkedin.loc[df_linkedin.index < 2018]
test_df = df_linkedin.loc[df_linkedin.index == 2018]

# Separate predictors and the "Downgrade" target for both partitions
X_train_pre, y_train_pre = train_df[features], train_df["Downgrade"]
X_test, y_test = test_df[features], test_df["Downgrade"]

In [None]:
X_train_pre.shape

### 4.3.2 Generating synthetic data of train data

In [None]:
# Create an ADASYN oversampler with a fixed seed
adasyn = ADASYN(random_state=42)

# Apply ADASYN to the training data to generate synthetic samples
X_train_adasyn, y_train_adasyn = adasyn.fit_resample(X_train_pre, y_train_pre)

# Wrap the resampled output in pandas objects again, reusing the original column names and target name
X_train = pd.DataFrame(X_train_adasyn, columns=X_train_pre.columns)
y_train = pd.Series(y_train_adasyn, name=y_train_pre.name)

In [None]:
X_test.to_csv(r'C:\Users\chiar\OneDrive\Masterthesis\Modell\Dataframes\Downgrade\LinkedIn\x_test_linkedin.csv', index=False)
X_train.to_csv(r'C:\Users\chiar\OneDrive\Masterthesis\Modell\Dataframes\Downgrade\LinkedIn\x_train_linkedin.csv', index=False)
y_test.to_csv(r'C:\Users\chiar\OneDrive\Masterthesis\Modell\Dataframes\Downgrade\LinkedIn\y_test_linkedin.csv', index=False)
y_train.to_csv(r'C:\Users\chiar\OneDrive\Masterthesis\Modell\Dataframes\Downgrade\LinkedIn\y_train_linkedin.csv', index=False)

In [None]:
# Class balance of the training target before and after oversampling.
# The sampler applied in this notebook is ADASYN, so the printed labels name it
# (they previously said "SMOTE").
temp1 = pd.DataFrame(y_train_pre)
temp2 = pd.DataFrame(y_train)

print('Before ADASYN')
print(temp1['Downgrade'].value_counts())
print('After ADASYN')
print(temp2['Downgrade'].value_counts())

In [None]:
temp3 = pd.DataFrame(y_test)

print('Check for test data')
print(temp3['Downgrade'].value_counts())

Oversampling successful.

### 4.3.3 Logistic Regression

In [None]:
# Fit a logistic regression on the oversampled training data
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train, y_train)
# NOTE(review): this filter is installed after fit() has already run, so it only
# silences warnings raised by later code — confirm this is intended.
warnings.filterwarnings("ignore")

In [None]:
y_pred = classifier.predict(X_test)

In [None]:
# Here the X-target variable is compared with the predicted values
cm = confusion_matrix(y_test, y_pred)
 
print ("Confusion Matrix : \n", cm)

In [None]:
#Visualization
plot_confusion_matrix(classifier,X_test,y_test,cmap='Blues')
plt.grid(False)

In [None]:
# Unpack the 2x2 confusion matrix and derive recall and precision of the positive class
tn, fp, fn, tp = cm.ravel()
recall = tp / (tp + fn)
precision = tp / (fp + tp)

# Report the four cell counts followed by the two derived rates
for prefix, figure in (
    ("True Negatives: ", tn),
    ("False Positives: ", fp),
    ("False Negatives: ", fn),
    ("True Positives: ", tp),
    ("Recall: ", recall),
    ("Precision: ", precision),
):
    print(prefix + str(figure))

In [None]:
from sklearn.metrics import accuracy_score
acc = accuracy_score(y_test, y_pred)
print ("Accuracy : ", acc)

In [None]:
print(classification_report(y_test,y_pred))

In [None]:
# Calculate the predicted probabilities for the positive class (class 1)
y_pred_prob = classifier.predict_proba(X_test)[:, 1]

# Calculate the AUC score
auc_score = roc_auc_score(y_test, y_pred_prob)
print("AUC Score:", auc_score)

In [None]:
y_train_pred = classifier.predict(X_train)

In [None]:
# Comparison and results check 
print(classification_report(y_train,y_train_pred))

In [None]:
Precisions_linkedin = {
    "Logistic Regression": [0.07, 0.96, 0.12 ,0.56],}
# precision, recall, acc, auc

### 4.3.4 Decision Tree

In [None]:
# Fit a decision tree on the oversampled training data.
# random_state is fixed, as for LogisticRegression and ADASYN in this notebook,
# so the tree and the metrics derived from it are reproducible across runs.
tree = DecisionTreeClassifier(random_state=0)
tree.fit(X_train, y_train)

In [None]:
y_pred_tree = tree.predict(X_test)

In [None]:
plot_confusion_matrix(tree,X_test,y_test,cmap='Blues')
plt.grid(False)

In [None]:
acc = accuracy_score(y_test, y_pred_tree)
print ("Accuracy : ", acc)

In [None]:
print(classification_report(y_test, y_pred_tree))

In [None]:
# Calculate the predicted probabilities for the positive class (class 1)
y_pred_prob_tree = tree.predict_proba(X_test)[:, 1]

# Calculate the AUC score for the Decision Tree classifier
auc_score_tree = roc_auc_score(y_test, y_pred_prob_tree)
print("AUC Score for Decision Tree:", auc_score_tree)

In [None]:
y_train_pred_tree = tree.predict(X_train)
plot_confusion_matrix(tree,X_train,y_train, cmap='Blues')
plt.grid(False)

In [None]:
print(classification_report(y_train, y_train_pred_tree))

In [None]:
Precisions_linkedin.update({
    "Decision Tree": [0.03, 0.04, 0.84, 0.47],
})

### 4.3.5 Random Forest

In [None]:
# Fit one random forest per maximum tree depth and print train/test reports.
# random_state is fixed, as for LogisticRegression and ADASYN in this notebook,
# so reruns build the same forests.
# After the loop, `rf` is the forest fitted with the last depth in the list (20).
tree_depth = [5, 10, 20]
for i in tree_depth:
    rf = RandomForestClassifier(max_depth=i, random_state=0)
    rf.fit(X_train, y_train)
    print('Max tree depth: ', i)
    print('Train results: ', classification_report(y_train, rf.predict(X_train)))
    print('Test results: ',classification_report(y_test, rf.predict(X_test)))

In [None]:
feature_scores = pd.Series(rf.feature_importances_, index=X_train.columns).sort_values(ascending=False)
feature_scores

In [None]:
# Calculate the predicted probabilities for the positive class (class 1)
y_pred_prob_rf = rf.predict_proba(X_test)[:, 1]

# Calculate the AUC score for the RandomForestClassifier
auc_score_rf = roc_auc_score(y_test, y_pred_prob_rf)
print('AUC Score for Random Forest:', auc_score_rf)

In [None]:
Precisions_linkedin.update({
    "Random Forest": [0.00, 0.00, 0.91, 0.48],
})

### 4.3.6 XGBoost

In [None]:
# Train an XGBoost classifier with default hyperparameters
# NOTE(review): the variable name `xgb` rebinds the `import xgboost as xgb` module
# alias from the top of the file — confirm the module alias is not needed afterwards.
xgb = XGBClassifier()
xgb.fit(X_train, y_train)

In [None]:
importance = plot_importance(xgb, height=0.9, max_num_features=10)
plt.show()

In [None]:
# Calculate the predicted probabilities for the positive class (class 1)
y_pred_prob_xgb = xgb.predict_proba(X_test)[:, 1]

# Calculate the AUC score for the XGBoost classifier
auc_score_xgb = roc_auc_score(y_test, y_pred_prob_xgb)
print('AUC Score for XGBoost:', auc_score_xgb)

In [None]:
print('Train results: ', classification_report(y_train, xgb.predict(X_train)))
print('Test results: ',classification_report(y_test, xgb.predict(X_test)))

In [None]:
Precisions_linkedin.update({
    "XGBoost": [0.0, 0.0, 0.87, 0.45],
})

## 4.4 Running models on df_com

### 4.4.1 Splitting between train and test data

In [None]:
# The index holds the year; cast it to int so it can be compared numerically
df_com.index = df_com.index.astype(int)

# Financial and LinkedIn predictor columns combined
features = ['Market Capitalization', 'EBIT', 'Net Income', 'Total Equity', 'Total Assets', 'Net Debt', 'Gross Profit', 'Total Employees', 'Total Revenue','Employee development', 'Number of notices', 'Migrating work experience', 'Number of New Joiners', 'New joining work experience', 'Fluctuation rate',]

# Time-based split (not a random percentage split):
# all years before 2018 form the training set, 2018 is the held-out test year
train_df = df_com.loc[df_com.index < 2018]
test_df = df_com.loc[df_com.index == 2018]

# Separate predictors and the "Downgrade" target for both partitions
X_train_pre, y_train_pre = train_df[features], train_df["Downgrade"]
X_test, y_test = test_df[features], test_df["Downgrade"]

In [None]:
X_train_pre.shape

### 4.4.2 Oversampling of train data

In [None]:
# Create an ADASYN oversampler with a fixed seed
adasyn = ADASYN(random_state=42)

# Apply ADASYN to the training data to generate synthetic samples
X_train_adasyn, y_train_adasyn = adasyn.fit_resample(X_train_pre, y_train_pre)

# Wrap the resampled output in pandas objects again, reusing the original column names and target name
X_train = pd.DataFrame(X_train_adasyn, columns=X_train_pre.columns)
y_train = pd.Series(y_train_adasyn, name=y_train_pre.name)

In [None]:
# Class balance of the training target before and after oversampling.
# The sampler applied in this notebook is ADASYN, so the printed labels name it
# (they previously said "SMOTE").
temp1 = pd.DataFrame(y_train_pre)
temp2 = pd.DataFrame(y_train)

print('Before ADASYN')
print(temp1['Downgrade'].value_counts())
print('After ADASYN')
print(temp2['Downgrade'].value_counts())

In [None]:
temp3 = pd.DataFrame(y_test)

print('Check for test data')
print(temp3['Downgrade'].value_counts())

In [None]:
X_test.to_csv(r'C:\Users\chiar\OneDrive\Masterthesis\Modell\Dataframes\Downgrade\Com\x_test_com.csv', index=False)
X_train.to_csv(r'C:\Users\chiar\OneDrive\Masterthesis\Modell\Dataframes\Downgrade\Com\x_train_com.csv', index=False)
y_test.to_csv(r'C:\Users\chiar\OneDrive\Masterthesis\Modell\Dataframes\Downgrade\Com\y_test_com.csv', index=False)
y_train.to_csv(r'C:\Users\chiar\OneDrive\Masterthesis\Modell\Dataframes\Downgrade\Com\y_train_com.csv', index=False)

### 4.4.3 Logistic Regression

In [None]:
# Fit a logistic regression on the oversampled training data
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train, y_train)
# NOTE(review): this filter is installed after fit() has already run, so it only
# silences warnings raised by later code — confirm this is intended.
warnings.filterwarnings("ignore")

In [None]:
y_pred = classifier.predict(X_test)
# Here the X-target variable is compared with the predicted values
cm = confusion_matrix(y_test, y_pred)
 
print ("Confusion Matrix : \n", cm)

In [None]:
#Visualization
plot_confusion_matrix(classifier,X_test,y_test,cmap='Blues')
plt.grid(False)

In [None]:
# Unpack the 2x2 confusion matrix and derive recall and precision of the positive class
tn, fp, fn, tp = cm.ravel()
recall = tp / (tp + fn)
precision = tp / (fp + tp)

# Report the four cell counts followed by the two derived rates
for prefix, figure in (
    ("True Negatives: ", tn),
    ("False Positives: ", fp),
    ("False Negatives: ", fn),
    ("True Positives: ", tp),
    ("Recall: ", recall),
    ("Precision: ", precision),
):
    print(prefix + str(figure))

In [None]:
from sklearn.metrics import accuracy_score

# Accuracy of the logistic regression predictions on the test split.
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy : ", acc)

In [None]:
# Classification report of the logistic regression on the test split.
test_report = classification_report(y_test, y_pred)
print(test_report)

In [None]:
# Calculate the predicted probabilities for the positive class (class 1)
# AUC on the test split, scored on the predicted probability of the positive
# class (class 1), i.e. column index 1 of `predict_proba`.
test_probabilities = classifier.predict_proba(X_test)
y_pred_prob = test_probabilities[:, 1]

auc_score = roc_auc_score(y_test, y_pred_prob)
print("AUC Score:", auc_score)

In [None]:
# Same report on the training data, to compare against the test results.
y_train_pred = classifier.predict(X_train)
train_report = classification_report(y_train, y_train_pred)
print(train_report)

In [None]:
# Hard-coded result record for the combined data set, keyed by model name.
# Each list holds four values that the results table in section 5.3 renders
# as: precision (class 1), recall (class 1), accuracy, AUC.
Precisions_com = {"Logistic Regression": [0.06, 0.22, 0.73, 0.46]}

### 4.4.4 Decision Tree

In [None]:
# Fit a decision tree on the oversampled training data. The seed is fixed
# (as for the logistic regression and ADASYN) so that re-running the notebook
# reproduces the same tree and therefore the same metrics.
tree = DecisionTreeClassifier(random_state=0)
tree.fit(X_train, y_train)

In [None]:
# Decision-tree predictions for the test split, plus the matching confusion
# matrix plot.
y_pred_tree = tree.predict(X_test)

plot_confusion_matrix(tree, X_test, y_test, cmap="Blues")
plt.grid(False)  # no grid lines across the matrix cells

In [None]:
# Accuracy of the decision tree on the test split (overwrites the `acc`
# value of the previous model).
acc = accuracy_score(y_true=y_test, y_pred=y_pred_tree)
print("Accuracy : ", acc)

In [None]:
# Classification report of the decision tree on the test split.
tree_test_report = classification_report(y_test, y_pred_tree)
print(tree_test_report)

In [None]:
# Decision-tree predictions for the training data, plus the matching
# confusion matrix plot.
y_train_pred_tree = tree.predict(X_train)

plot_confusion_matrix(tree, X_train, y_train, cmap="Blues")
plt.grid(False)  # no grid lines across the matrix cells

In [None]:
# Calculate the predicted probabilities for the positive class (class 1)
# AUC of the decision tree on the test split, scored on the predicted
# probability of the positive class (column index 1 of `predict_proba`).
tree_test_probabilities = tree.predict_proba(X_test)
y_pred_prob_tree = tree_test_probabilities[:, 1]

auc_score_tree = roc_auc_score(y_test, y_pred_prob_tree)
print("AUC Score for Decision Tree:", auc_score_tree)

In [None]:
# Classification report of the decision tree on the training data.
tree_train_report = classification_report(y_train, y_train_pred_tree)
print(tree_train_report)

In [None]:
# Record the hard-coded decision-tree metrics; same four-value layout as the
# other entries of `Precisions_com`.
Precisions_com["Decision Tree"] = [0.17, 0.13, 0.89, 0.54]

### 4.4.5 Random Forest

In [None]:
# Fit one random forest per maximum tree depth and print the train and test
# classification reports for each depth.
# NOTE: `rf` is rebound on every iteration, so after the loop it refers to
# the forest with the LAST depth in `tree_depth` (20); the feature-importance
# and AUC cells below evaluate that model only.
tree_depth = [5, 10, 20]
for depth in tree_depth:
    # Fixed seed (as for the logistic regression) so that re-running the
    # notebook reproduces the same forests and metrics.
    rf = RandomForestClassifier(max_depth=depth, random_state=0)
    rf.fit(X_train, y_train)
    print('Max tree depth: ', depth)
    print('Train results: ', classification_report(y_train, rf.predict(X_train)))
    print('Test results: ', classification_report(y_test, rf.predict(X_test)))

In [None]:
# Feature importances of the current `rf` model, labelled with the training
# column names and ordered from most to least important.
rf_importances = pd.Series(data=rf.feature_importances_, index=X_train.columns)
feature_scores = rf_importances.sort_values(ascending=False)
feature_scores

In [None]:
# Calculate the predicted probabilities for the positive class (class 1)
# AUC of the current `rf` model on the test split, scored on the predicted
# probability of the positive class (column index 1 of `predict_proba`).
rf_test_probabilities = rf.predict_proba(X_test)
y_pred_prob_rf = rf_test_probabilities[:, 1]

auc_score_rf = roc_auc_score(y_test, y_pred_prob_rf)
print('AUC Score for Random Forest:', auc_score_rf)

In [None]:
# Record the hard-coded random-forest metrics; same four-value layout as the
# other entries of `Precisions_com`.
Precisions_com["Random Forest"] = [0.25, 0.04, 0.93, 0.50]

### 4.4.6 XGBoost 

In [None]:
# Fit an XGBoost classifier with default hyper-parameters on the oversampled
# training data.
# NOTE(review): the variable name `xgb` rebinds the module alias created by
# `import xgboost as xgb` at the top of the notebook; from here on `xgb` is
# the fitted classifier, not the xgboost module.
xgb = XGBClassifier()
xgb.fit(X=X_train, y=y_train)

In [None]:
# Bar chart of the ten most important features of the fitted XGBoost model.
importance = plot_importance(
    xgb,
    height=0.9,
    max_num_features=10,
)
plt.show()

In [None]:
# Classification reports of the XGBoost model, first on the training data and
# then on the test split.
xgb_train_report = classification_report(y_train, xgb.predict(X_train))
print('Train results: ', xgb_train_report)

xgb_test_report = classification_report(y_test, xgb.predict(X_test))
print('Test results: ', xgb_test_report)

In [None]:
# Calculate the predicted probabilities for the positive class (class 1)
# AUC of the XGBoost model on the test split, scored on the predicted
# probability of the positive class (column index 1 of `predict_proba`).
xgb_test_probabilities = xgb.predict_proba(X_test)
y_pred_prob_xgb = xgb_test_probabilities[:, 1]

auc_score_xgb = roc_auc_score(y_test, y_pred_prob_xgb)
print('AUC Score for XGBoost:', auc_score_xgb)

In [None]:
# Record the hard-coded XGBoost metrics; same four-value layout as the other
# entries of `Precisions_com`.
Precisions_com["XGBoost"] = [0.36, 0.17, 0.92, 0.59]

# 5. Evaluating and comparing Models

The results are compiled and neatly presented.

## 5.1 Results financial data

In [None]:
# Summarize the recorded metrics of the financial-data models in a table,
# print the best value of every metric column in red and name the model with
# the highest AUC.
headers = ["", "Precision (1)", "Recall (1)", "Accuracy", "AUC"]
table = PrettyTable()
table.field_names = headers

# ANSI escape sequences used for the red highlighting.
red, reset = "\033[31m", "\033[0m"

# Column-wise maxima, index-aligned with `headers`; slot 0 belongs to the
# model-name column and is never compared. 0.0 is the floor, so negative
# values are never highlighted.
max_values = [0.0] * len(headers)
for metrics in Precisions_financials.values():
    for col, value in enumerate(metrics[:4], start=1):
        max_values[col] = max(max_values[col], value)

best_model_auc = ""
max_auc_value = 0.0

for model, metrics in Precisions_financials.items():
    # Each record is ordered like the metric columns of `headers`.
    precision_1, recall_1, accuracy, auc = metrics[:4]

    # Strict '>' keeps the first model listed when several share the top AUC.
    if auc > max_auc_value:
        max_auc_value = auc
        best_model_auc = model

    # Highlight while building the row, so that no PrettyTable internals
    # (`table._rows`) have to be modified after insertion.
    cells = [
        f"{red}{value}{reset}" if value == max_values[col] else value
        for col, value in enumerate((precision_1, recall_1, accuracy, auc), start=1)
    ]
    table.add_row([model, *cells])

# Print the table with the highest value of each column marked in red
print(table)

# Print the "Best model" message in red
print(f"{red}Best model based on AUC: {best_model_auc}{reset}")


## 5.2 Results LinkedIn data

In [None]:
# Summarize the recorded metrics of the LinkedIn-data models in a table,
# print the best value of every metric column in red and name the model with
# the highest AUC.
headers = ["", "Precision (1)", "Recall (1)", "Accuracy", "AUC"]
table = PrettyTable()
table.field_names = headers

# ANSI escape sequences used for the red highlighting.
red, reset = "\033[31m", "\033[0m"

# Column-wise maxima, index-aligned with `headers`; slot 0 belongs to the
# model-name column and is never compared. 0.0 is the floor, so negative
# values are never highlighted.
max_values = [0.0] * len(headers)
for metrics in Precisions_linkedin.values():
    for col, value in enumerate(metrics[:4], start=1):
        max_values[col] = max(max_values[col], value)

best_model_auc = ""
max_auc_value = 0.0

for model, metrics in Precisions_linkedin.items():
    # Each record is ordered like the metric columns of `headers`.
    precision_1, recall_1, accuracy, auc = metrics[:4]

    # Strict '>' keeps the first model listed when several share the top AUC.
    if auc > max_auc_value:
        max_auc_value = auc
        best_model_auc = model

    # Highlight while building the row, so that no PrettyTable internals
    # (`table._rows`) have to be modified after insertion.
    cells = [
        f"{red}{value}{reset}" if value == max_values[col] else value
        for col, value in enumerate((precision_1, recall_1, accuracy, auc), start=1)
    ]
    table.add_row([model, *cells])

# Print the table with the highest value of each column marked in red
print(table)

# Print the "Best model" message in red
print(f"{red}Best model based on AUC: {best_model_auc}{reset}")


## 5.3 Results combined data

In [None]:
# Summarize the recorded metrics of the combined-data models in a table,
# print the best value of every metric column in red and name the model with
# the highest AUC.
headers = ["", "Precision (1)", "Recall (1)", "Accuracy", "AUC"]
table = PrettyTable()
table.field_names = headers

# ANSI escape sequences used for the red highlighting.
red, reset = "\033[31m", "\033[0m"

# Column-wise maxima, index-aligned with `headers`; slot 0 belongs to the
# model-name column and is never compared. 0.0 is the floor, so negative
# values are never highlighted.
max_values = [0.0] * len(headers)
for metrics in Precisions_com.values():
    for col, value in enumerate(metrics[:4], start=1):
        max_values[col] = max(max_values[col], value)

best_model_auc = ""
max_auc_value = 0.0

for model, metrics in Precisions_com.items():
    # Each record is ordered like the metric columns of `headers`.
    precision_1, recall_1, accuracy, auc = metrics[:4]

    # Strict '>' keeps the first model listed when several share the top AUC.
    if auc > max_auc_value:
        max_auc_value = auc
        best_model_auc = model

    # Highlight while building the row, so that no PrettyTable internals
    # (`table._rows`) have to be modified after insertion.
    cells = [
        f"{red}{value}{reset}" if value == max_values[col] else value
        for col, value in enumerate((precision_1, recall_1, accuracy, auc), start=1)
    ]
    table.add_row([model, *cells])

# Print the table with the highest value of each column marked in red
print(table)

# Print the "Best model" message in red
print(f"{red}Best model based on AUC: {best_model_auc}{reset}")