# Data Collection, reading and Exploring the data
## This process covers the cleaning up the data into a form that can be used for the Machine Learning models

In [294]:
# Dependencies
import pandas as pd
from pathlib import Path

In [295]:
# Load the data into a Pandas DataFrame
df = pd.read_csv("clean_data.csv")
df.head(10)

Unnamed: 0.1,Unnamed: 0,Name in English,Countries,Degree of endangerment,Number of speakers,Latitude,Longitude,Description of the location
0,0,South Italian,Italy,Vulnerable,7500000.0,40.9798,15.249,"Campania, Lucania (Basilicata), Abruzzi (Abruz..."
1,1,Sicilian,Italy,Vulnerable,5000000.0,37.4399,14.5019,"Sicily (Sicilia), southern and central Calabri..."
2,2,Low Saxon,"Germany, Denmark, Netherlands, Poland, Russian...",Vulnerable,4800000.0,53.4029,10.3601,"northern Germany, the north-eastern part of th..."
3,3,Belarusian,"Belarus, Latvia, Lithuania, Poland, Russian Fe...",Vulnerable,4000000.0,53.956,27.5756,Belarus except the Polesian-speaking south-wes...
4,4,Lombard,"Italy, Switzerland",Definitely endangered,3500000.0,45.7215,9.3273,the region of Lombardy (except the southernmos...
5,5,Romani,"Albania, Germany, Austria, Belarus, Bosnia and...",Definitely endangered,3500000.0,46.3165,22.3681,"many European countries, most densely in East-..."
6,6,Yiddish (Israel),Israel,Definitely endangered,3000000.0,32.0833,34.8333,"most speakers now live outside Europe, mainly ..."
7,7,Gondi,India,Vulnerable,2713790.0,19.5804,80.4418,"Madhya Pradesh (Betul, Chhindwara, Seoni, Mand..."
8,8,Limburgian-Ripuarian,"Germany, Belgium, Netherlands",Vulnerable,2600000.0,50.7781,6.0864,Limburg provinces in Belgium and the Netherlan...
9,9,Quechua of Southern Bolivia,Bolivia (Plurinational State of),Vulnerable,2300000.0,-18.0675,-65.7641,"Departments of Cochabamba, Chuquisaca, Potosí ..."


In [296]:
df_named = pd.DataFrame(df)

In [297]:
df_named

Unnamed: 0.1,Unnamed: 0,Name in English,Countries,Degree of endangerment,Number of speakers,Latitude,Longitude,Description of the location
0,0,South Italian,Italy,Vulnerable,7500000.0,40.9798,15.2490,"Campania, Lucania (Basilicata), Abruzzi (Abruz..."
1,1,Sicilian,Italy,Vulnerable,5000000.0,37.4399,14.5019,"Sicily (Sicilia), southern and central Calabri..."
2,2,Low Saxon,"Germany, Denmark, Netherlands, Poland, Russian...",Vulnerable,4800000.0,53.4029,10.3601,"northern Germany, the north-eastern part of th..."
3,3,Belarusian,"Belarus, Latvia, Lithuania, Poland, Russian Fe...",Vulnerable,4000000.0,53.9560,27.5756,Belarus except the Polesian-speaking south-wes...
4,4,Lombard,"Italy, Switzerland",Definitely endangered,3500000.0,45.7215,9.3273,the region of Lombardy (except the southernmos...
...,...,...,...,...,...,...,...,...
2717,2717,Yonaguni,Japan,Severely endangered,,24.4580,122.9802,Yonaguni Island
2718,2718,Yucuna,Colombia,Definitely endangered,,-1.2633,-70.6640,
2719,2719,Yurutí (Colombia),Colombia,Definitely endangered,,1.1370,-70.2136,
2720,2720,Zangskari,"India, Pakistan",Definitely endangered,,34.0833,76.8500,


In [298]:
# Step 2: Data Preprocessing
# Fill in any missing values if necessary (you can fill with mean, median, etc.)
df_named.fillna(method='ffill', inplace=True)  # Forward fill for simplicity

  df_named.fillna(method='ffill', inplace=True)  # Forward fill for simplicity


In [299]:
df_lan_df = pd.DataFrame(df_named)

In [300]:
df_languages = df_named.rename(columns={'Unnamed: 0': 'ID'})

In [301]:
# df_languages.dtypes

In [302]:
# Identify unique values
DoE = df_languages['Degree of endangerment'].unique()

print(DoE)

['Vulnerable' 'Definitely endangered' 'Severely endangered'
 'Critically endangered' 'Extinct']


In [303]:
# Critically endangered = 0; Vulnerable = 4; Severely endangered = 3; Definitely endangered = 1; Extinct = 2

In [304]:
classification = {
    1: 'Critically endangered',
    3: 'Definitely endangered',
    0: 'Extinct',
    2: 'Severely endangered',
    4: 'Vulnerable'
}

# Create a new dataframe using the classification
classification_df = pd.DataFrame(list(classification.items()), columns=['Code', 'Degree of endangerment'])
classification_df

Unnamed: 0,Code,Degree of endangerment
0,1,Critically endangered
1,3,Definitely endangered
2,0,Extinct
3,2,Severely endangered
4,4,Vulnerable


In [305]:
from sklearn.preprocessing import LabelEncoder

# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Fit and transform the 'Degree of endangerment' column
df_languages['Degree of endangerment'] = label_encoder.fit_transform(df_languages['Degree of endangerment'])

# Display the updated dataframe
df_languages.head()

Unnamed: 0,ID,Name in English,Countries,Degree of endangerment,Number of speakers,Latitude,Longitude,Description of the location
0,0,South Italian,Italy,4,7500000.0,40.9798,15.249,"Campania, Lucania (Basilicata), Abruzzi (Abruz..."
1,1,Sicilian,Italy,4,5000000.0,37.4399,14.5019,"Sicily (Sicilia), southern and central Calabri..."
2,2,Low Saxon,"Germany, Denmark, Netherlands, Poland, Russian...",4,4800000.0,53.4029,10.3601,"northern Germany, the north-eastern part of th..."
3,3,Belarusian,"Belarus, Latvia, Lithuania, Poland, Russian Fe...",4,4000000.0,53.956,27.5756,Belarus except the Polesian-speaking south-wes...
4,4,Lombard,"Italy, Switzerland",1,3500000.0,45.7215,9.3273,the region of Lombardy (except the southernmos...


In [306]:
df_languages.head()

Unnamed: 0,ID,Name in English,Countries,Degree of endangerment,Number of speakers,Latitude,Longitude,Description of the location
0,0,South Italian,Italy,4,7500000.0,40.9798,15.249,"Campania, Lucania (Basilicata), Abruzzi (Abruz..."
1,1,Sicilian,Italy,4,5000000.0,37.4399,14.5019,"Sicily (Sicilia), southern and central Calabri..."
2,2,Low Saxon,"Germany, Denmark, Netherlands, Poland, Russian...",4,4800000.0,53.4029,10.3601,"northern Germany, the north-eastern part of th..."
3,3,Belarusian,"Belarus, Latvia, Lithuania, Poland, Russian Fe...",4,4000000.0,53.956,27.5756,Belarus except the Polesian-speaking south-wes...
4,4,Lombard,"Italy, Switzerland",1,3500000.0,45.7215,9.3273,the region of Lombardy (except the southernmos...


In [307]:
# Drop the Countries column in the df_languages dataframe
# df_languages.drop(columns=['Countries'], inplace=True)
columns = df_languages.columns.tolist()
# columns

In [308]:
countries = df_languages['Countries'].tolist()
# countries

In [309]:
# create a variable to store the number of countries
# num_countries = len(countries[2].split(", "))
num_countries = [len(c.split(", ")) for c in countries]
# num_countries

In [310]:
df_languages['Countries'] = num_countries
df_languages.head()

Unnamed: 0,ID,Name in English,Countries,Degree of endangerment,Number of speakers,Latitude,Longitude,Description of the location
0,0,South Italian,1,4,7500000.0,40.9798,15.249,"Campania, Lucania (Basilicata), Abruzzi (Abruz..."
1,1,Sicilian,1,4,5000000.0,37.4399,14.5019,"Sicily (Sicilia), southern and central Calabri..."
2,2,Low Saxon,5,4,4800000.0,53.4029,10.3601,"northern Germany, the north-eastern part of th..."
3,3,Belarusian,6,4,4000000.0,53.956,27.5756,Belarus except the Polesian-speaking south-wes...
4,4,Lombard,2,1,3500000.0,45.7215,9.3273,the region of Lombardy (except the southernmos...


In [311]:
# Replace the 'Name in English' to 'language' in the pandas dataframe 
df_languages.rename(columns={'Name in English': 'Language'}, inplace=True)

In [312]:
# Drop the Countries column in the df_languages dataframe
df_languages.drop(columns=['Description of the location'], inplace=True)

In [313]:
df_languages

Unnamed: 0,ID,Language,Countries,Degree of endangerment,Number of speakers,Latitude,Longitude
0,0,South Italian,1,4,7500000.0,40.9798,15.2490
1,1,Sicilian,1,4,5000000.0,37.4399,14.5019
2,2,Low Saxon,5,4,4800000.0,53.4029,10.3601
3,3,Belarusian,6,4,4000000.0,53.9560,27.5756
4,4,Lombard,2,1,3500000.0,45.7215,9.3273
...,...,...,...,...,...,...,...
2717,2717,Yonaguni,1,3,0.0,24.4580,122.9802
2718,2718,Yucuna,1,1,0.0,-1.2633,-70.6640
2719,2719,Yurutí (Colombia),1,1,0.0,1.1370,-70.2136
2720,2720,Zangskari,2,1,0.0,34.0833,76.8500


In [314]:
nan_counts = df_languages.isna().sum()
print(nan_counts)

ID                        0
Language                  0
Countries                 0
Degree of endangerment    0
Number of speakers        0
Latitude                  0
Longitude                 0
dtype: int64


# Step 2: Supervised Learning  - Classifying Languages at Risk of Extinction (Machine Learning)
## Using 2 models/algorithms for classification: XGBoost and Logistic Regression 

In [319]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error, mean_absolute_error
from xgboost import XGBRegressor
import xgboost as xgb

In [316]:
# Step 3: Encode target variable 'Degree of endangerment'
# label_encoder = LabelEncoder()
# df_languages['Degree of endangerment'] = label_encoder.fit_transform(df_languages['Degree of endangerment'])
df_languages #. merge(classification_df, on='Degree of endangerment')

Unnamed: 0,ID,Language,Countries,Degree of endangerment,Number of speakers,Latitude,Longitude
0,0,South Italian,1,4,7500000.0,40.9798,15.2490
1,1,Sicilian,1,4,5000000.0,37.4399,14.5019
2,2,Low Saxon,5,4,4800000.0,53.4029,10.3601
3,3,Belarusian,6,4,4000000.0,53.9560,27.5756
4,4,Lombard,2,1,3500000.0,45.7215,9.3273
...,...,...,...,...,...,...,...
2717,2717,Yonaguni,1,3,0.0,24.4580,122.9802
2718,2718,Yucuna,1,1,0.0,-1.2633,-70.6640
2719,2719,Yurutí (Colombia),1,1,0.0,1.1370,-70.2136
2720,2720,Zangskari,2,1,0.0,34.0833,76.8500


In [None]:
# # One-hot encoding for categorical variables
# encoder = OneHotEncoder(sparse_output=False)
# encoded_columns = encoder.fit_transform(df_languages[['Language']])
# # Create a DataFrame for encoded columns
# encoded_df = pd.DataFrame(encoded_columns, columns=encoder.get_feature_names_out(['Language']))

# # Combine the encoded columns with the original numerical data
# df_final = pd.concat([df_languages[['Number of speakers']], encoded_df], axis=1)


In [341]:
# Step 4: Define Features (X) and Target (y)
X = df_languages[['Number of speakers', 'Countries']] # Target variable

# X = df[['Number of speakers', 'Latitude', 'Longitude', 'Internet usage', 'Government policies']]
y = df_languages['Degree of endangerment']  # This assumes it's encoded as integers

In [342]:
# Step 5: Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [343]:
# Step 6: Train a Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)


In [344]:
# Step 7: Make predictions
y_pred = rf_model.predict(X_test)

In [None]:
# Step 8: Evaluate the model
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Confusion Matrix:
[[75 10 18 15  3]
 [11 64 13 28 40]
 [ 0  0 46  0  1]
 [16 29  4 31 21]
 [ 8 46  4 23 39]]

Classification Report:
              precision    recall  f1-score   support

           0       0.68      0.62      0.65       121
           1       0.43      0.41      0.42       156
           2       0.54      0.98      0.70        47
           3       0.32      0.31      0.31       101
           4       0.38      0.33      0.35       120

    accuracy                           0.47       545
   macro avg       0.47      0.53      0.49       545
weighted avg       0.46      0.47      0.46       545



In [346]:
# Step 9: Feature Importance
importances = rf_model.feature_importances_
features = X.columns
importance_df = pd.DataFrame({'Feature': features, 'Importance': importances})
importance_df = importance_df.sort_values(by='Importance', ascending=False)

print("\nFeature Importances:")
print(importance_df)


Feature Importances:
              Feature  Importance
0  Number of speakers    0.972209
1           Countries    0.027791


In [None]:
# Train an XGBoost model for comparison
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)

print("\nXGBoost Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_xgb))
print("\nXGBoost Classification Report:")
print(classification_report(y_test, y_pred_xgb))

Parameters: { "use_label_encoder" } are not used.




XGBoost Confusion Matrix:
[[80  9 18 12  2]
 [ 2 70 13 22 49]
 [ 0  0 46  0  1]
 [18 26  4 24 29]
 [ 6 43  4 20 47]]

XGBoost Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.66      0.70       121
           1       0.47      0.45      0.46       156
           2       0.54      0.98      0.70        47
           3       0.31      0.24      0.27       101
           4       0.37      0.39      0.38       120

    accuracy                           0.49       545
   macro avg       0.49      0.54      0.50       545
weighted avg       0.49      0.49      0.48       545

