In [78]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE

In [34]:
df_2023 = pd.read_csv("Elcat_data.csv")
df_2024 = pd.read_csv("Kaggle_data.csv")

# Display the first few rows
print("2023 Data Sample:")
print(df_2023.head())

print("\n2024 Data Sample:")
print(df_2024.head())

2023 Data Sample:
   Unnamed: 0 ISO639-3 codes Name in English  \
0        3645            knw            !Xun   
1        3956            bpk           'Ôrôê   
2        1933            taa  (Lower) Tanana   
3        1043            con         A'ingae   
4        3581            aas           Aasáx   

                                     Alternate names  \
0  Ju; !Xun (Ekoka); Kung-Ekoka; !Kung; Ekoka-!Xû...   
1                       Orowe; Boewe; Neukaledonien;   
2                                                NaN   
3  Kofane; Cofán; Kofán; A'i; A'ingaé; Colin; Kof...   
4  Asax; Asá; Aasá; Assa; Asak; "Ndorobo"; "Dorob...   

                              Degree of endangerment  \
0  Vulnerable (20 percent certain, based on the e...   
1  Endangered (20 percent certain, based on the e...   
2  Critically Endangered (80 percent certain, bas...   
3  Vulnerable (100 percent certain, based on the ...   
4                                           Dormant    

  Number of speaker

In [35]:
print("Missing Values in 2023 Dataset:")
print(df_2023.isnull().sum())

print("\nMissing Values in 2024 Dataset:")
print(df_2024.isnull().sum())

Missing Values in 2023 Dataset:
Unnamed: 0                      0
ISO639-3 codes                 73
Name in English                 0
Alternate names               438
Degree of endangerment        120
Number of speakers (2023)     318
Language Branch                11
Comments                     3188
Description of language      2827
Countries                      11
Continent                      11
Coordinates                   357
dtype: int64

Missing Values in 2024 Dataset:
ID                                0
Name in English                   0
Countries                         1
Country codes alpha 3             1
ISO639-3 codes                  264
Degree of endangerment            0
Alternate names                1139
Name in the language           2695
Number of speakers 2024         183
Sources                         643
Latitude                          3
Longitude                         3
Description of the location     852
dtype: int64


In [5]:
# Load CSV files
df_2023 = pd.read_csv("Elcat_data.csv")
df_2024 = pd.read_csv("Kaggle_data.csv")

def convert_speaker_count(value):
    if isinstance(value, str):  # Ensure it's a string
        value = value.replace(",", "").strip()  # Remove commas & spaces
        
        # Handle ranges (e.g., "14000-18000" → average)
        if "-" in value:
            try:
                low, high = map(int, value.split("-"))
                return (low + high) / 2  # Compute the average
            except ValueError:
                return np.nan  # Handle invalid range format

        # Handle less-than or greater-than symbols (e.g., "<500" → 500)
        elif "<" in value or ">" in value:
            try:
                return int(value.replace("<", "").replace(">", ""))
            except ValueError:
                return np.nan
        
        # Convert simple numbers
        elif value.isdigit():
            return int(value)


In [6]:
print(df_2023["Number of speakers (2023)"].dtype)
print(df_2024["Number of speakers 2024"].dtype)


object
float64


In [7]:
# Check which values are still non-numeric
non_numeric_values = df_2023[~df_2023["Number of speakers (2023)"].astype(str).str.replace(".", "", regex=True).str.isdigit()]

# Display problematic rows
print(non_numeric_values[["Name in English", "Number of speakers (2023)"]])


       Name in English Number of speakers (2023)
0                 !Xun             14,000-18,000
1                'Ôrôê                       590
2       (Lower) Tanana                        25
3              A'ingae                     1,500
4                Aasáx                         0
...                ...                       ...
3461            ||Gana                     1,030
3462  Łingít (Tlingit)                       200
3463             ǂHoan                       <40
3464      ǂKx'au||'ein                       NaN
3465    ᏣᎳᎩ (Cherokee)                      2100

[3466 rows x 2 columns]


In [8]:
def clean_speaker_count(value):
    if isinstance(value, str):  # Ensure it's a string
        value = value.replace(",", "").strip()  # Remove commas and extra spaces

        # Handle approximate values like "~2000000"
        if "~" in value:
            value = value.replace("~", "")  # Remove tilde symbol

        # Handle ranges like "14000-18000" → Convert to average
        if "-" in value:
            try:
                low, high = map(int, value.split("-"))
                return (low + high) / 2  # Compute the average
            except ValueError:
                return np.nan  # Handle invalid range format

        # Handle less-than or greater-than symbols (e.g., "<500" → 500)
        if "<" in value or ">" in value:
            value = re.sub(r"[<>]", "", value)  # Remove < and > symbols

        # Convert valid numbers
        if value.isdigit():
            return int(value)

    return np.nan  # Return NaN for unprocessable values

In [9]:
df_2023["Number of speakers (2023)"] = df_2023["Number of speakers (2023)"].astype(str).apply(clean_speaker_count)

# Fill missing values (e.g., NaN) with a dummy number (100 speakers)
df_2023["Number of speakers (2023)"].fillna(100, inplace=True)

# Convert the column to float
df_2023["Number of speakers (2023)"] = df_2023["Number of speakers (2023)"].astype(float)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_2023["Number of speakers (2023)"].fillna(100, inplace=True)


In [10]:
print(df_2023["Number of speakers (2023)"].dtype)  # Should print float64
print(df_2023["Number of speakers (2023)"].unique())  # Should show only numbers
print(df_2023["Number of speakers (2023)"].describe())  # Summary statistics


float64
[1.60000e+04 5.90000e+02 2.50000e+01 1.50000e+03 0.00000e+00 5.00000e+00
 5.00000e+02 7.00000e+03 4.78800e+04 3.50000e+03 5.00000e+01 1.50000e+01
 2.34700e+03 2.56000e+03 3.00000e+03 2.83000e+02 2.00000e+02 7.00000e+04
 5.15930e+04 5.44300e+03 1.00000e+01 4.00000e+03 3.50000e+01 1.00190e+05
 1.00000e+00 1.00000e+02 2.89000e+04 1.15000e+02 1.80000e+04 6.70000e+02
 5.50000e+04 3.47000e+03 9.68000e+02 3.00000e+02 3.00000e+01 1.87500e+02
 8.19000e+02 1.40000e+02 2.64000e+03 1.10000e+02 2.00000e+00 6.20000e+02
 2.50000e+02 7.92600e+03 8.00000e+01 5.57200e+03 2.00000e+03 5.00000e+03
 8.00000e+02 6.00000e+02 3.00000e+00 1.50000e+02 2.75000e+02 4.00000e+02
 1.50000e+04 1.00000e+04 1.20000e+03 1.70000e+01 5.50000e+03 2.50000e+04
 8.50000e+01 6.33000e+02 3.50000e+02 8.34000e+04 1.88000e+03 1.00000e+03
 1.20000e+04 4.00000e+04 1.66400e+03 4.50000e+02 3.88000e+02 3.70000e+04
 2.31000e+02 7.70000e+02 3.60000e+03 3.30000e+04 5.20000e+03 6.00000e+03
 2.30000e+02 4.50000e+01 7.33040e+05 1.0700

In [11]:
df_2023.head()

Unnamed: 0.1,Unnamed: 0,ISO639-3 codes,Name in English,Alternate names,Degree of endangerment,Number of speakers (2023),Language Branch,Comments,Description of language,Countries,Continent,Coordinates
0,3645,knw,!Xun,Ju; !Xun (Ekoka); Kung-Ekoka; !Kung; Ekoka-!Xû...,"Vulnerable (20 percent certain, based on the e...",16000.0,Kx'a,,,South Africa;Namibia;Angola;,Africa,"-28.74358,23.983154; -17.560247, 18.050537; -1..."
1,3956,bpk,'Ôrôê,Orowe; Boewe; Neukaledonien;,"Endangered (20 percent certain, based on the e...",590.0,Austronesian; Malayo-Polynesian; Oceanic; New ...,,,New Caledonia;,Pacific,"-21.4223,165.4678"
2,1933,taa,(Lower) Tanana,,"Critically Endangered (80 percent certain, bas...",25.0,Athabaskan-Eyak-Tlingit; Dene (Athabaskan),,Tanana is the language of the Lower Tanana riv...,USA;,North America,"65.157778, -149.37;64.521111, -146.980556;64.5..."
3,1043,con,A'ingae,Kofane; Cofán; Kofán; A'i; A'ingaé; Colin; Kof...,"Vulnerable (100 percent certain, based on the ...",1500.0,Isolate; South American,,,Colombia;Ecuador;,South America,"0.054639, -77.409417"
4,3581,aas,Aasáx,"Asax; Asá; Aasá; Assa; Asak; ""Ndorobo""; ""Dorob...",Dormant,0.0,Afro-Asiatic; Cushitic; South Cushitic,,,Tanzania;,Africa,"-5.1948,37.738"


In [12]:
# Merge datasets on ISO639-3 codes (ensuring a proper join)
df_combined = df_2023.merge(df_2024, on="ISO639-3 codes", suffixes=("_2023", "_2024"), how="inner")

# Calculate Speaker Change Percentage
df_combined["Speaker Change (%)"] = ((df_combined["Number of speakers 2024"] - df_combined["Number of speakers (2023)"]) 
                                     / df_combined["Number of speakers (2023)"]) * 100

# Drop rows where ISO codes are missing (though unlikely in a merged dataset)
df_combined.dropna(subset=["ISO639-3 codes"], inplace=True)


In [13]:
df_combined

Unnamed: 0.1,Unnamed: 0,ISO639-3 codes,Name in English_2023,Alternate names_2023,Degree of endangerment_2023,Number of speakers (2023),Language Branch,Comments,Description of language,Countries_2023,...,Country codes alpha 3,Degree of endangerment_2024,Alternate names_2024,Name in the language,Number of speakers 2024,Sources,Latitude,Longitude,Description of the location,Speaker Change (%)
0,3956,bpk,'Ôrôê,Orowe; Boewe; Neukaledonien;,"Endangered (20 percent certain, based on the e...",590.0,Austronesian; Malayo-Polynesian; Oceanic; New ...,,,New Caledonia;,...,NCL,Definitely endangered,abwébwé,,587.0,1996 Census,-21.4223,165.4678,Bourail,-0.508475
1,1933,taa,(Lower) Tanana,,"Critically Endangered (80 percent certain, bas...",25.0,Athabaskan-Eyak-Tlingit; Dene (Athabaskan),,Tanana is the language of the Lower Tanana riv...,USA;,...,USA,Critically endangered,Lower Tanana,,15.0,"Krauss 2007, \nGolla et al. ms.",64.6050,-149.0625,"villages of Minto and Nenana, was also spoken ...",-40.000000
2,1043,con,A'ingae,Kofane; Cofán; Kofán; A'i; A'ingaé; Colin; Kof...,"Vulnerable (100 percent certain, based on the ...",1500.0,Isolate; South American,,,Colombia;Ecuador;,...,ECU,Definitely endangered,"Cofan, Kofan (see Colombia)",,700.0,"- CODENPE - PRODEPINE, Diagnóstico Participati...",0.0659,-76.7065,"In Ecuador: Province of Sucumbíos, Lago Agrio ...",-53.333333
3,1043,con,A'ingae,Kofane; Cofán; Kofán; A'i; A'ingaé; Colin; Kof...,"Vulnerable (100 percent certain, based on the ...",1500.0,Isolate; South American,,,Colombia;Ecuador;,...,COL,Severely endangered,"Kofan, A'i, A'ingae, Kofane, Cofan",,379.0,Fundación Zio-A'I Unión de Sabiduría - Censo -...,0.4525,-76.9191,"Colombia-Ecuador border area. In Colombia, Dep...",-74.733333
4,3581,aas,Aasáx,"Asax; Asá; Aasá; Assa; Asak; ""Ndorobo""; ""Dorob...",Dormant,0.0,Afro-Asiatic; Cushitic; South Cushitic,,,Tanzania;,...,TZA,Extinct,Asa,,0.0,Maarten Mous,-5.1948,37.7380,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21121,850,tli,Łingít (Tlingit),Tlinkit; Thlinget; Inland Tlingit; Lingit; Kol...,"Critically Endangered (80 percent certain, bas...",200.0,Athabaskan-Eyak-Tlingit; Tlingit,,,USA;Canada;,...,USA,Critically endangered,Tlinkit,,300.0,Krauss 2007,58.3340,-133.7475,Tlingit area is the southeast Alaska coast fro...,50.000000
21122,850,tli,Łingít (Tlingit),Tlinkit; Thlinget; Inland Tlingit; Lingit; Kol...,"Critically Endangered (80 percent certain, bas...",200.0,Athabaskan-Eyak-Tlingit; Tlingit,,,USA;Canada;,...,CAN,Critically endangered,,,55.0,"1996 and 2001 Censuses, Statistics Canada; bas...",60.1710,-132.7395,Spread over 3 communities in Canada,-72.500000
21123,594,huc,ǂHoan,ǂHua; ǂHuan; ≠Hû; =|Hua; =|Hua-Owani; |Hua; |H...,"Severely Endangered (60 percent certain, based...",40.0,Kx'a,,,Botswana;,...,BWA,Critically endangered,"ǂHõã, Eastern ǂHõã",,140.0,Kemmonye Monaka (p.c.),-24.1668,24.4885,Pointer in geographic centre of settlement area.,250.000000
21124,1684,chr,ᏣᎳᎩ (Cherokee),Iroquois; Tsalagi; Tslagi; Rickohockan; Rechah...,"Threatened (20 percent certain, based on the e...",2100.0,Iroquoian,,,USA;,...,USA,Definitely endangered,,,10000.0,"Victor Golla, Ives Goddard, Lyle Campbell, Mar...",36.7544,-98.3569,"Cherokee Nation, OKlahoma, and United Keetoowa...",376.190476


In [14]:
print(df_combined[["Name in English_2023", "Speaker Change (%)"]].sort_values(by="Speaker Change (%)", ascending=False).head(10))

       Name in English_2023  Speaker Change (%)
9722           Kiowa Apache                 inf
10896                  Leco                 inf
11452                  Loun                 inf
18449             Tequiraca                 inf
605    Arabana-Wangkangurru                 inf
20496               Wichita                 inf
1170                   Auré                 inf
5823             Holikachuk                 inf
19622                   Uru                 inf
18742                Tirahi                 inf


In [28]:
# Set a dynamic threshold: Replace 0 with a fraction of the next year's count
df_combined["Number of speakers (2023)"] = np.where(df_combined["Number of speakers (2023)"] == 0, 
                                                    df_combined["Number of speakers 2024"] * 0.1,  # Use 10% of 2024 count
                                                    df_combined["Number of speakers (2023)"])

# Recalculate Speaker Change Percentage
df_combined["Speaker Change (%)"] = ((df_combined["Number of speakers 2024"] - df_combined["Number of speakers (2023)"]) 
                                     / df_combined["Number of speakers (2023)"]) * 100

# Cap extreme values (e.g., set max at 500%)
df_combined["Speaker Change (%)"] = df_combined["Speaker Change (%)"].clip(upper=100)

# Also calculate absolute change
df_combined["Speaker Change (absolute)"] = df_combined["Number of speakers 2024"] - df_combined["Number of speakers (2023)"]

# View top growing languages again
print(df_combined[["Name in English_2023", "Speaker Change (%)"]].sort_values(by="Speaker Change (%)", ascending=False).head(10))


      Name in English_2023  Speaker Change (%)
5817              Hla'alua               100.0
15742               Ottawa               100.0
18459           Texistepec               100.0
11455               Ludian               100.0
3836                Dusner               100.0
18460           Thado Chin               100.0
18465                 Thao               100.0
18466              Thavung               100.0
18733              Thulung               100.0
11728          Machiguenga               100.0


In [29]:
df_combined

Unnamed: 0.1,Unnamed: 0,ISO639-3 codes,Name in English_2023,Alternate names_2023,Degree of endangerment_2023,Number of speakers (2023),Language Branch,Comments,Description of language,Countries_2023,...,Degree of endangerment_2024,Alternate names_2024,Name in the language,Number of speakers 2024,Sources,Latitude,Longitude,Description of the location,Speaker Change (%),Speaker Change (absolute)
0,3956,bpk,'Ôrôê,Orowe; Boewe; Neukaledonien;,"Endangered (20 percent certain, based on the e...",590.0,Austronesian; Malayo-Polynesian; Oceanic; New ...,,,New Caledonia;,...,Definitely endangered,abwébwé,,587.0,1996 Census,-21.4223,165.4678,Bourail,-0.508475,-3.0
1,1933,taa,(Lower) Tanana,,"Critically Endangered (80 percent certain, bas...",25.0,Athabaskan-Eyak-Tlingit; Dene (Athabaskan),,Tanana is the language of the Lower Tanana riv...,USA;,...,Critically endangered,Lower Tanana,,15.0,"Krauss 2007, \nGolla et al. ms.",64.6050,-149.0625,"villages of Minto and Nenana, was also spoken ...",-40.000000,-10.0
2,1043,con,A'ingae,Kofane; Cofán; Kofán; A'i; A'ingaé; Colin; Kof...,"Vulnerable (100 percent certain, based on the ...",1500.0,Isolate; South American,,,Colombia;Ecuador;,...,Definitely endangered,"Cofan, Kofan (see Colombia)",,700.0,"- CODENPE - PRODEPINE, Diagnóstico Participati...",0.0659,-76.7065,"In Ecuador: Province of Sucumbíos, Lago Agrio ...",-53.333333,-800.0
3,1043,con,A'ingae,Kofane; Cofán; Kofán; A'i; A'ingaé; Colin; Kof...,"Vulnerable (100 percent certain, based on the ...",1500.0,Isolate; South American,,,Colombia;Ecuador;,...,Severely endangered,"Kofan, A'i, A'ingae, Kofane, Cofan",,379.0,Fundación Zio-A'I Unión de Sabiduría - Censo -...,0.4525,-76.9191,"Colombia-Ecuador border area. In Colombia, Dep...",-74.733333,-1121.0
4,3581,aas,Aasáx,"Asax; Asá; Aasá; Assa; Asak; ""Ndorobo""; ""Dorob...",Dormant,0.0,Afro-Asiatic; Cushitic; South Cushitic,,,Tanzania;,...,Extinct,Asa,,0.0,Maarten Mous,-5.1948,37.7380,,,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21121,850,tli,Łingít (Tlingit),Tlinkit; Thlinget; Inland Tlingit; Lingit; Kol...,"Critically Endangered (80 percent certain, bas...",200.0,Athabaskan-Eyak-Tlingit; Tlingit,,,USA;Canada;,...,Critically endangered,Tlinkit,,300.0,Krauss 2007,58.3340,-133.7475,Tlingit area is the southeast Alaska coast fro...,50.000000,100.0
21122,850,tli,Łingít (Tlingit),Tlinkit; Thlinget; Inland Tlingit; Lingit; Kol...,"Critically Endangered (80 percent certain, bas...",200.0,Athabaskan-Eyak-Tlingit; Tlingit,,,USA;Canada;,...,Critically endangered,,,55.0,"1996 and 2001 Censuses, Statistics Canada; bas...",60.1710,-132.7395,Spread over 3 communities in Canada,-72.500000,-145.0
21123,594,huc,ǂHoan,ǂHua; ǂHuan; ≠Hû; =|Hua; =|Hua-Owani; |Hua; |H...,"Severely Endangered (60 percent certain, based...",40.0,Kx'a,,,Botswana;,...,Critically endangered,"ǂHõã, Eastern ǂHõã",,140.0,Kemmonye Monaka (p.c.),-24.1668,24.4885,Pointer in geographic centre of settlement area.,100.000000,100.0
21124,1684,chr,ᏣᎳᎩ (Cherokee),Iroquois; Tsalagi; Tslagi; Rickohockan; Rechah...,"Threatened (20 percent certain, based on the e...",2100.0,Iroquoian,,,USA;,...,Definitely endangered,,,10000.0,"Victor Golla, Ives Goddard, Lyle Campbell, Mar...",36.7544,-98.3569,"Cherokee Nation, OKlahoma, and United Keetoowa...",100.000000,7900.0


In [44]:
print(df_combined[["Name in English_2023", "Speaker Change (%)"]].sort_values(by="Speaker Change (%)", ascending=False).head(10))

      Name in English_2023  Speaker Change (%)
5817              Hla'alua               100.0
15742               Ottawa               100.0
18459           Texistepec               100.0
11455               Ludian               100.0
3836                Dusner               100.0
18460           Thado Chin               100.0
18465                 Thao               100.0
18466              Thavung               100.0
18733              Thulung               100.0
11728          Machiguenga               100.0


In [45]:
df_combined.to_csv('merged_data.csv')

In [46]:
print(df_combined[["Name in English_2023", "Speaker Change (%)"]]
      .sort_values(by="Speaker Change (%)", ascending=True)  # Sort in ascending order
      .head(10))  # Get the top 10 decreasing languages


      Name in English_2023  Speaker Change (%)
12356               Mirití              -100.0
15162              Nyang'i              -100.0
3284                Dhurga              -100.0
1498               Berakou              -100.0
13213             Naka'ela              -100.0
15167                N||ng              -100.0
611                Arapaho              -100.0
8092             Kashubian              -100.0
10911         Lishana Deni              -100.0
10908                Lipan              -100.0


In [47]:
print(df_combined[df_combined["Speaker Change (%)"] == -100.0]
      [["Name in English_2023", "Number of speakers (2023)", "Number of speakers 2024", "Speaker Change (%)"]])


      Name in English_2023  Number of speakers (2023)  \
31            Ainu (Japan)                        2.0   
32            Ainu (Japan)                        2.0   
40            Akkala Saami                        1.0   
318                Amanayé                      100.0   
325                  Andoa                        1.0   
...                    ...                        ...   
20197                Wappo                      100.0   
20502                Wiyot                      100.0   
20521                Yaaku                       50.0   
20820           Yir-Yoront                       15.0   
20835               Yurutí                      687.0   

       Number of speakers 2024  Speaker Change (%)  
31                         0.0              -100.0  
32                         0.0              -100.0  
40                         0.0              -100.0  
318                        0.0              -100.0  
325                        0.0              -100.0

In [48]:
df_combined["Speaker Change (%)"] = df_combined["Speaker Change (%)"].clip(upper=100, lower=-20)


In [49]:
print(df_combined[["Name in English_2023", "Speaker Change (%)"]]
      .sort_values(by="Speaker Change (%)", ascending=True)
      .head(10))

          Name in English_2023  Speaker Change (%)
21125           ᏣᎳᎩ (Cherokee)               -20.0
15157                 Nupbikha               -20.0
15156                  Numbami               -20.0
15154                   Nukini               -20.0
15150                Nsyilxcən               -20.0
14884        Northern Tutchone               -20.0
5248                   Guarayu               -20.0
14882  Northern Straits Salish               -20.0
5239               Gros Ventre               -20.0
5250                Guarequena               -20.0


In [50]:
print(df_combined["Speaker Change (%)"].describe())


count    1732.000000
mean       16.414458
std        45.385438
min       -20.000000
25%       -20.000000
50%         0.000000
75%        33.333333
max       100.000000
Name: Speaker Change (%), dtype: float64


In [55]:
print(df_combined[["Name in English_2023", "Speaker Change (%)"]]
      .sort_values(by="Speaker Change (%)", ascending=True)
      .head(10))


          Name in English_2023  Speaker Change (%)
21125           ᏣᎳᎩ (Cherokee)               -20.0
15157                 Nupbikha               -20.0
15156                  Numbami               -20.0
15154                   Nukini               -20.0
15150                Nsyilxcən               -20.0
14884        Northern Tutchone               -20.0
5248                   Guarayu               -20.0
14882  Northern Straits Salish               -20.0
5239               Gros Ventre               -20.0
5250                Guarequena               -20.0


In [None]:
# Encode the target variable (Degree of Endangerment)
label_encoder = LabelEncoder()
df_combined["Degree of endangerment_2024_encoded"] = label_encoder.fit_transform(df_combined["Degree of endangerment_2024"])

# Define features and target variable
features = ["Speaker Change (%)", "Number of speakers 2024"]
target = "Degree of endangerment_2024_encoded"  

X = df_combined[features]
y = df_combined[target]

# Split into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Train Logistic Regression Model
log_reg = LogisticRegression(max_iter=1000, random_state=42)
log_reg.fit(X_train, y_train)

# Make predictions
y_pred = log_reg.predict(X_test)

# Evaluate model performance
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))


Accuracy: 0.40161725067385445
                       precision    recall  f1-score   support

Critically endangered       0.38      0.95      0.54       101
Definitely endangered       0.46      0.50      0.48        98
              Extinct       0.00      0.00      0.00        17
  Severely endangered       0.00      0.00      0.00        74
           Vulnerable       0.31      0.05      0.09        81

             accuracy                           0.40       371
            macro avg       0.23      0.30      0.22       371
         weighted avg       0.29      0.40      0.29       371



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [75]:
features = ["Speaker Change (%)", "Number of speakers 2024"]

In [76]:
# Encode "Degree of endangerment_2023" into numbers
df_combined["Degree of endangerment_2023_encoded"] = label_encoder.fit_transform(df_combined["Degree of endangerment_2023"])

# Define features with more information
features = ["Speaker Change (%)", "Number of speakers 2024", "Number of speakers (2023)", "Degree of endangerment_2023_encoded"]

# Define target variable
target = "Degree of endangerment_2024_encoded"

# Select X (features) and y (target)
X = df_combined[features]
y = df_combined[target]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [82]:
# Apply SMOTE to the training dataset
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Check class distribution after SMOTE
print("Class distribution after SMOTE:")
print(pd.Series(y_train_resampled).value_counts())


Class distribution after SMOTE:
Degree of endangerment_2024_encoded
2    387
0    387
3    387
4    387
1    387
Name: count, dtype: int64


In [84]:
# Train Logistic Regression Model on Balanced Data
log_reg = LogisticRegression(max_iter=5000, solver="lbfgs", random_state=42)
log_reg.fit(X_train_resampled, y_train_resampled)

# Make predictions on test set
y_pred = log_reg.predict(X_test)

# Evaluate model performance
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=label_encoder.inverse_transform(sorted(set(y_test)))))

Accuracy: 0.3288409703504043
                                                                              precision    recall  f1-score   support

               At risk (20 percent certain, based on the evidence available)       0.46      0.33      0.38       101
               At risk (60 percent certain, based on the evidence available)       0.60      0.24      0.35        98
                                                                Awakening ()       0.14      0.76      0.24        17
Critically Endangered (100 percent certain, based on the evidence available)       0.60      0.04      0.08        74
 Critically Endangered (20 percent certain, based on the evidence available)       0.30      0.60      0.40        81

                                                                    accuracy                           0.33       371
                                                                   macro avg       0.42      0.40      0.29       371
                         

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
