# Language Proficiency Classification

In [11]:
import os
import json
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np
import joblib
import librosa

### Read Json Data


In [2]:
# Step 1: Preprocess JSON Data
def read_json_features(folder_path):
    data = []
    for file_name in os.listdir(folder_path):
        if file_name.endswith('.json'):
            with open(os.path.join(folder_path, file_name)) as f:
                features = json.load(f)
                features['id'] = file_name.split('.')[0]  # Extract ID from filename
                data.append(features)
    return pd.DataFrame(data)

features_df = read_json_features('features')

### Read Human Scores

In [4]:

# Step 2: Preprocess Scores File
def read_scores(file_path):
    scores = {}
    with open(file_path) as f:
        for line in f:
            parts = line.split()
            id = parts[0]
            scores_list = [float(score) for score in parts[1:] if score]
            scores[id] = sum(scores_list) / len(scores_list)  # Average if double scored
    return scores

scores = read_scores('/Users/emreugur/Downloads/DATASETS/L2Corpus/scores.txt')
scores_df = pd.DataFrame(list(scores.items()), columns=['id', 'score'])

### Merge Data

In [5]:
# Step 3: Merge Data
merged_df = pd.merge(features_df, scores_df, on='id')

In [6]:
pd.set_option('display.max_columns', None)  # This will allow all columns to be displayed
# Display the first 5 rows of the DataFrame
print(merged_df.head().to_string(index=False))

 total_duration  total_words  unique_words_count  average_chunk_length_in_words  articulation_rate  mean_deviation_of_chunks_in_words  duration_of_silences_per_word  mean_of_silence_duration  mean_duration_of_long_pauses  frequency_of_longer_pauses_divided_by_number_of_words  types_divided_by_uttsegdur  mean_length_of_filled_pauses  frequency_of_filled_pauses     id  score
         56.988          100                  64                       2.702703           2.907061                           1.700511                       0.224600                  0.623889                      0.967500                                               0.180000                    1.123043                      0.439091                    0.193023 010_04    2.5
         45.517           79                  54                       2.724138           3.144153                           1.607610                       0.256987                  0.725071                      1.156067                            

### Train

In [7]:
# Step 4: Train the Model
X = merged_df.drop(['id', 'score'], axis=1)
y = merged_df['score']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)

### Save Model

In [12]:
# Save the trained model to a file
model_filename = 'trained_model.joblib'
joblib.dump(model, model_filename)

print(f'Model saved to {model_filename}')

Model saved to trained_model.joblib


### Load Model


In [13]:
# Load the trained model from the file
model = joblib.load(model_filename)

### Evaluate

In [10]:
# Predict and evaluate
y_pred = model.predict(X_test)



# Assuming y_test are the true values and y_pred are the predictions from your model

# Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

# Root Mean Squared Error (RMSE)
rmse = np.sqrt(mse)
print(f'Root Mean Squared Error: {rmse}')

# Mean Absolute Error (MAE)
mae = mean_absolute_error(y_test, y_pred)
print(f'Mean Absolute Error: {mae}')

# R-squared (Coefficient of Determination)
r2 = r2_score(y_test, y_pred)
print(f'R-squared: {r2}')

# Adjusted R-squared
n = len(y_test)  # Number of observations
p = X_test.shape[1]  # Number of independent variables
adj_r2 = 1 - ((1 - r2) * (n - 1) / (n - p - 1))
print(f'Adjusted R-squared: {adj_r2}')


Mean Squared Error: 0.20829736865260196
Root Mean Squared Error: 0.45639606555337653
Mean Absolute Error: 0.39158691063599244
R-squared: 0.04759501180492509
Adjusted R-squared: -0.46829102346740714


### Predict

In [22]:
# Specify the ID and construct the file path
file_id = '001_01'  # Replace 'your_id' with the actual ID
file_path = f'features/{file_id}.json'  # Constructs the path to the JSON file

# Load and extract features from the JSON file
# Load and extract features from the JSON file into a DataFrame
with open(file_path, 'r') as file:
    data = json.load(file)
    # Convert the dictionary to a DataFrame, which preserves the feature names
    features = pd.DataFrame([data])

# Pass the features through the model to get the prediction
predicted_score = model.predict(features)

print(f'Predicted Score for ID {file_id}: {predicted_score[0]}')


Predicted Score for ID 001_01: 3.141354165600726
