In [44]:
# Dependencies
import pandas as pd
from pathlib import Path

In [45]:
# Load the data into a Pandas DataFrame
df = pd.read_csv("clean_data.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,Name in English,Countries,Degree of endangerment,Number of speakers,Latitude,Longitude,Description of the location
0,0,South Italian,Italy,Vulnerable,7500000.0,40.9798,15.249,"Campania, Lucania (Basilicata), Abruzzi (Abruz..."
1,1,Sicilian,Italy,Vulnerable,5000000.0,37.4399,14.5019,"Sicily (Sicilia), southern and central Calabri..."
2,2,Low Saxon,"Germany, Denmark, Netherlands, Poland, Russian...",Vulnerable,4800000.0,53.4029,10.3601,"northern Germany, the north-eastern part of th..."
3,3,Belarusian,"Belarus, Latvia, Lithuania, Poland, Russian Fe...",Vulnerable,4000000.0,53.956,27.5756,Belarus except the Polesian-speaking south-wes...
4,4,Lombard,"Italy, Switzerland",Definitely endangered,3500000.0,45.7215,9.3273,the region of Lombardy (except the southernmos...


In [46]:
df.columns

Index(['Unnamed: 0', 'Name in English', 'Countries', 'Degree of endangerment',
       'Number of speakers', 'Latitude', 'Longitude',
       'Description of the location'],
      dtype='object')

In [47]:
df_named = pd.DataFrame(df)

In [48]:
df_languages = df_named.rename(columns={'Unnamed: 0': 'ID'})

In [49]:
df_languages.columns

Index(['ID', 'Name in English', 'Countries', 'Degree of endangerment',
       'Number of speakers', 'Latitude', 'Longitude',
       'Description of the location'],
      dtype='object')

# Objective 2: Classifying Languages at Risk of Extinction (Machine Learning)

In [50]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error, mean_absolute_error
from xgboost import XGBRegressor
import xgboost as xgb

In [51]:
# Step 2: Data Preprocessing
# Fill in any missing values if necessary (you can fill with mean, median, etc.)
df_languages.fillna(method='ffill', inplace=True)  # Forward fill for simplicity

  df_languages.fillna(method='ffill', inplace=True)  # Forward fill for simplicity


In [52]:
# Step 3: Encode target variable 'Degree of endangerment'
label_encoder = LabelEncoder()
df_languages['Degree of endangerment'] = label_encoder.fit_transform(df_languages['Degree of endangerment'])

In [53]:
# Step 4: Define Features (X) and Target (y)
X = df_languages[['Number of speakers', 'Latitude', 'Longitude']]
# Add additional features if available, e.g., internet usage, government policies
# X = df[['Number of speakers', 'Latitude', 'Longitude', 'Internet usage', 'Government policies']]
y = df_languages['Degree of endangerment']  # This assumes it's encoded as integers

In [54]:
# Step 5: Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [55]:
# Step 6: Train a Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

In [56]:
# Step 7: Make predictions
y_pred = rf_model.predict(X_test)

In [57]:
# Step 8: Evaluate the model
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Confusion Matrix:
[[87  4  7 21  2]
 [ 2 81  7 29 37]
 [ 3  2 39  2  1]
 [21 22  1 46 11]
 [ 0 32  3  5 80]]

Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.72      0.74       121
           1       0.57      0.52      0.55       156
           2       0.68      0.83      0.75        47
           3       0.45      0.46      0.45       101
           4       0.61      0.67      0.64       120

    accuracy                           0.61       545
   macro avg       0.62      0.64      0.63       545
weighted avg       0.61      0.61      0.61       545



In [58]:
# Step 9: Feature Importance
importances = rf_model.feature_importances_
features = X.columns
importance_df = pd.DataFrame({'Feature': features, 'Importance': importances})
importance_df = importance_df.sort_values(by='Importance', ascending=False)

print("\nFeature Importances:")
print(importance_df)


Feature Importances:
              Feature  Importance
0  Number of speakers    0.402672
2           Longitude    0.303509
1            Latitude    0.293819


In [59]:
# Optional: Train an XGBoost model for comparison
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)

print("\nXGBoost Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_xgb))
print("\nXGBoost Classification Report:")
print(classification_report(y_test, y_pred_xgb))


XGBoost Confusion Matrix:
[[78  9  8 25  1]
 [ 6 87  4 26 33]
 [ 1  3 40  2  1]
 [23 17  0 47 14]
 [ 4 32  2 10 72]]

XGBoost Classification Report:
              precision    recall  f1-score   support

           0       0.70      0.64      0.67       121
           1       0.59      0.56      0.57       156
           2       0.74      0.85      0.79        47
           3       0.43      0.47      0.45       101
           4       0.60      0.60      0.60       120

    accuracy                           0.59       545
   macro avg       0.61      0.62      0.62       545
weighted avg       0.60      0.59      0.59       545



Parameters: { "use_label_encoder" } are not used.



# Objective 3: Predicting Language Growth or Decline


In [60]:
df_languages.dtypes

ID                               int64
Name in English                 object
Countries                       object
Degree of endangerment           int64
Number of speakers             float64
Latitude                       float64
Longitude                      float64
Description of the location     object
dtype: object

In [61]:
!pip install pandas scikit-learn xgboost



In [62]:
# Step 3: Define Features (X) and Target (y)
# Here we specify current number of speakers, degree of endangerment, and we can add more features as needed.
X = df[['Degree of endangerment', 'Number of speakers']]  # Add more features as per your merged data
y = df['Number of speakers']  # This would usually be the future count to predict

In [63]:
# Step 4: If you want to simulate future speakers, you may need to shift your target variable:
# Create a dummy feature or use your external data for projections
# y = df['Future number of speakers'] (this requires historical data)

In [64]:
# Step 5: Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [65]:
# Step 6: Standardize Features (optional but recommended)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

ValueError: could not convert string to float: 'Extinct'

In [None]:
# Step 7: Train an XGBoost Regressor
model = XGBRegressor(objective='reg:squarederror', n_estimators=100, random_state=42)
model.fit(X_train, y_train)

ValueError: DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, the experimental DMatrix parameter`enable_categorical` must be set to `True`.  Invalid columns:Degree of endangerment: object