<a href="https://colab.research.google.com/github/ARSHITHbabu/ArticulateX/blob/main/ArticulateX.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import pandas as pd
import librosa
import os
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Load audio files and extract features and labels
def load_audio_files(metadata, audio_dir, target_length=128):
    """Build fixed-width log-mel spectrograms and labels from a metadata table.

    For every row of ``metadata`` the audio file is looked up at
    ``<audio_dir>/fold<fold>/<slice_file_name>``. Rows whose file does not
    exist are skipped silently; rows whose file cannot be loaded are skipped
    after printing an error message.

    Args:
        metadata: Table iterated with ``iterrows()``; each row must provide
            the ``fold``, ``slice_file_name`` and ``class`` fields.
        audio_dir: Directory that contains the ``fold*`` sub-directories.
        target_length: Number of spectrogram columns every feature is padded
            (with zeros, on the right) or truncated to.

    Returns:
        A ``(features, labels)`` pair of NumPy arrays built from the rows
        that were loaded successfully. Both are empty when nothing loaded.
    """
    spectrograms, class_names = [], []

    for _, entry in metadata.iterrows():
        fold_dir = 'fold' + str(entry['fold'])
        wav_path = os.path.join(audio_dir, fold_dir, entry['slice_file_name'])

        # Metadata may list clips that are not present on disk.
        if not os.path.exists(wav_path):
            continue

        try:
            # sr=None keeps the file's native sampling rate.
            signal, rate = librosa.load(wav_path, sr=None)
        except Exception as e:
            print(f"Error loading {wav_path}: {e}")
            continue

        # 128-band mel power spectrogram, converted to dB relative to its peak.
        mel_db = librosa.power_to_db(
            librosa.feature.melspectrogram(y=signal, sr=rate, n_mels=128, fmax=8000),
            ref=np.max,
        )

        # Force a common width so the features stack into one array.
        missing = target_length - mel_db.shape[1]
        if missing > 0:
            mel_db = np.pad(mel_db, ((0, 0), (0, missing)), mode='constant')
        else:
            mel_db = mel_db[:, :target_length]

        spectrograms.append(mel_db)
        class_names.append(entry['class'])

    return np.array(spectrograms), np.array(class_names)

# CNN classifier (three conv/pool stages) with dropout regularization
def train_model(X, y):
    """Train a small convolutional classifier on spectrogram features.

    Labels are integer-encoded with a ``LabelEncoder``, the data is split
    80/20 into training and validation sets (``random_state=42``), and a
    three-stage Conv2D/MaxPooling2D network followed by dropout and two dense
    layers is fitted for 30 epochs with the Adam optimizer.

    Args:
        X: Feature array; its second and third dimensions define the model's
            input height and width, with a single channel.
        y: Class labels, one per entry of ``X``.

    Returns:
        A ``(model, le)`` pair: the fitted Keras model and the fitted label
        encoder needed to map predictions back to class names.
    """
    encoder = LabelEncoder()
    targets = encoder.fit_transform(y)
    n_classes = len(np.unique(targets))

    X_train, X_val, y_train, y_val = train_test_split(
        X, targets, test_size=0.2, random_state=42
    )

    layers = tf.keras.layers
    stack = [layers.Input(shape=(X_train.shape[1], X_train.shape[2], 1))]

    # Three conv/pool stages with a growing number of filters.
    for n_filters in (32, 64, 128):
        stack.append(layers.Conv2D(n_filters, (3, 3), activation='relu', padding='same'))
        stack.append(layers.MaxPooling2D(pool_size=(2, 2)))

    # Classification head: dropout before the dense layers, softmax over classes.
    stack.extend([
        layers.Flatten(),
        layers.Dropout(0.5),
        layers.Dense(128, activation='relu'),
        layers.Dense(n_classes, activation='softmax'),
    ])

    network = tf.keras.Sequential(stack)
    network.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    network.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=30)

    return network, encoder

# Analyze user audio input and provide metrics
def analyze_user_audio(model, le, user_audio_path, target_length=128):
    """Classify one audio file and score it on four speaking metrics.

    The file is converted to a 128-band log-mel spectrogram, padded with
    zeros or truncated to ``target_length`` columns, and passed to ``model``
    as a single-item batch with a trailing channel axis.

    Args:
        model: Trained classifier exposing ``predict``.
        le: Fitted label encoder used to turn the predicted index back into
            a class name via ``inverse_transform``.
        user_audio_path: Path of the audio file to analyze.
        target_length: Number of spectrogram columns fed to the model. It
            should equal the width the model was trained on; the default of
            128 matches the default of ``load_audio_files``.

    Returns:
        A tuple ``(predicted_class, score, fluency, coherence, accuracy,
        pronunciation)`` where ``score`` is the highest value in the model's
        prediction and the remaining four values come from
        ``analyze_fluency``, ``analyze_coherence``, ``analyze_accuracy`` and
        ``analyze_pronunciation``.
    """
    # sr=None keeps the file's native sampling rate.
    y, sr = librosa.load(user_audio_path, sr=None)
    mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128, fmax=8000)
    mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)

    # Same pad-or-truncate rule as the training features.
    frames = mel_spec_db.shape[1]
    if frames < target_length:
        mel_spec_db = np.pad(
            mel_spec_db, ((0, 0), (0, target_length - frames)), mode='constant'
        )
    else:
        mel_spec_db = mel_spec_db[:, :target_length]

    # Add the batch axis in front and the channel axis at the end.
    batch = mel_spec_db[np.newaxis, ..., np.newaxis]

    prediction = model.predict(batch)
    predicted_class = le.inverse_transform(np.argmax(prediction, axis=1))[0]
    score = np.max(prediction)

    # Analyze fluency, coherence, accuracy, and pronunciation
    fluency = analyze_fluency(y)
    coherence = analyze_coherence(y)
    accuracy = analyze_accuracy(y, predicted_class)
    pronunciation = analyze_pronunciation(y)

    return predicted_class, score, fluency, coherence, accuracy, pronunciation

# Placeholder analysis functions: each ignores its input and returns a random score in [0.5, 1.0)
def analyze_fluency(y):
    """Return a fluency score for the audio signal ``y``.

    Placeholder: ``y`` is not inspected; the score is drawn uniformly at
    random from the range 0.5 to 1.0.
    """
    lower, upper = 0.5, 1.0
    return np.random.uniform(lower, upper)

def analyze_coherence(y):
    """Return a coherence score for the audio signal ``y``.

    Placeholder: ``y`` is not inspected; the score is drawn uniformly at
    random from the range 0.5 to 1.0.
    """
    lower, upper = 0.5, 1.0
    return np.random.uniform(lower, upper)

def analyze_accuracy(y, predicted_class):
    """Return an accuracy score for the audio signal ``y``.

    Placeholder: neither ``y`` nor ``predicted_class`` is inspected; the
    score is drawn uniformly at random from the range 0.5 to 1.0.
    """
    lower, upper = 0.5, 1.0
    return np.random.uniform(lower, upper)

def analyze_pronunciation(y):
    """Return a pronunciation score for the audio signal ``y``.

    Placeholder: ``y`` is not inspected; the score is drawn uniformly at
    random from the range 0.5 to 1.0.
    """
    lower, upper = 0.5, 1.0
    return np.random.uniform(lower, upper)

# Recommendation logic: score-bucketed tips plus a Q-value table updated from user feedback
class RecommendationSystem:
    """Pick practice tips from metric scores and track Q-values from feedback.

    ``action_space`` maps each of the four metric names to a list of five
    tips. A score is bucketed into an index 0-2 by ``get_action_index`` and
    that index selects the tip, so only the first three tips of each list
    are reachable. ``q_values`` is a 3x4 table indexed as
    ``[action_index][category_index]``; it is updated by ``update_q_values``
    but is not consulted when choosing a tip.
    """

    def __init__(self):
        # Learning parameters for the Q-value update.
        self.learning_rate = 0.1
        self.discount_factor = 0.9

        # One row per action bucket, one column per metric category.
        self.q_values = np.zeros((3, 4))

        self.action_space = {
            "fluency": [
                "Practice reading aloud daily to improve fluency.",
                "Engage in conversation with a partner for better fluency.",
                "Use tongue twisters to enhance speech speed.",
                "Watch English movies and repeat lines for natural pacing.",
                "Record and listen to yourself to catch fluency issues.",
            ],
            "coherence": [
                "Outline your thoughts before speaking.",
                "Practice summarizing stories in your own words.",
                "Join a speaking club to practice structured discussions.",
                "Use transition words to improve flow.",
                "Practice connecting ideas logically in conversations.",
            ],
            "accuracy": [
                "Review grammar rules regularly.",
                "Practice writing short paragraphs to improve sentence structure.",
                "Listen to native speakers and mimic their speech.",
                "Take grammar quizzes online to test your knowledge.",
                "Engage in exercises focusing on verb tenses.",
            ],
            "pronunciation": [
                "Utilize pronunciation apps for targeted practice.",
                "Record yourself and compare your pronunciation with native speakers.",
                "Practice phonetic drills to improve specific sounds.",
                "Use language learning platforms that focus on speaking.",
                "Join language exchange programs for practical experience.",
            ],
        }

    def get_recommendation(self, fluency, coherence, accuracy, pronunciation):
        """Return one tip per metric, keyed by metric name.

        The returned dict always lists the keys in the order fluency,
        coherence, accuracy, pronunciation.
        """
        scores_by_metric = {
            'fluency': fluency,
            'coherence': coherence,
            'accuracy': accuracy,
            'pronunciation': pronunciation,
        }
        return {
            metric: self.action_space[metric][self.get_action_index(value)]
            for metric, value in scores_by_metric.items()
        }

    def get_action_index(self, score):
        """Bucket a score: 0 for >= 0.8, 1 for >= 0.5, otherwise 2."""
        if score >= 0.8:
            return 0  # High performance
        if score >= 0.5:
            return 1  # Moderate performance
        return 2  # Low performance

    def update_q_values(self, category_index, action_index, reward):
        """Blend ``reward`` into the Q-value of one (action, category) cell.

        The new value is a learning-rate-weighted mix of the current value
        and ``reward`` plus the discounted maximum Q-value in the same
        category column.
        """
        previous = self.q_values[action_index, category_index]
        best_in_category = np.max(self.q_values[:, category_index])
        target = reward + self.discount_factor * best_in_category
        self.q_values[action_index, category_index] = (
            (1 - self.learning_rate) * previous + self.learning_rate * target
        )

# Function to compare the results of the two audios
def compare_audios(first_scores, second_scores):
    """Describe how each metric changed between two analyses.

    Args:
        first_scores: Scores of the first audio, ordered as fluency,
            coherence, accuracy, pronunciation.
        second_scores: Scores of the second audio in the same order.

    Returns:
        Dict mapping each metric name to ``"Improved by X.XX"``,
        ``"Declined by X.XX"`` or ``"No change"``.
    """
    labels = ('fluency', 'coherence', 'accuracy', 'pronunciation')
    summary = {}

    for position, before in enumerate(first_scores):
        delta = second_scores[position] - before

        verdict = "No change"
        if delta > 0:
            verdict = f"Improved by {delta:.2f}"
        elif delta < 0:
            verdict = f"Declined by {abs(delta):.2f}"

        summary[labels[position]] = verdict

    return summary

# Function to collect user feedback
def collect_feedback():
    """Prompt for a 1-5 rating and return it scaled to the range 0.2-1.0.

    The prompt is repeated until the reply is a whole number between 1 and
    5, so a typo does not abort the session and an out-of-range rating
    cannot produce a reward above 1.0 or below 0.2.

    Returns:
        float: ``rating / 5.0``.

    Raises:
        EOFError: Propagated from ``input`` when the input stream ends
            before a valid rating is entered.
    """
    while True:
        reply = input("Rate the recommendation (1-5): ").strip()

        try:
            rating = int(reply)
        except ValueError:
            print("Please enter a whole number between 1 and 5.")
            continue

        if 1 <= rating <= 5:
            return rating / 5.0

        print("Please enter a whole number between 1 and 5.")

# Main execution
# Dataset root on the mounted Drive; the metadata CSV sits directly inside it.
audio_dir = '/content/drive/MyDrive/ArticulateX-audio files/audio file'
metadata = pd.read_csv(f"{audio_dir}/UrbanSound8K.csv")

# Extract spectrogram features and labels, then append a trailing channel
# axis so the features match the single-channel input of the model.
X, y = load_audio_files(metadata, audio_dir)
X = X[..., np.newaxis]

# Fit the classifier; the label encoder is kept to decode predictions later.
model, le = train_model(X, y)

# Ask for the first recording
user_audio_path = input("Please upload your first audio file (.mp3, .wav): ")

# Classify the recording and score it on the four speaking metrics
(predicted_class, score,
 fluency, coherence, accuracy, pronunciation) = analyze_user_audio(model, le, user_audio_path)

# The recommender is created once here and reused by the later steps
recommender = RecommendationSystem()
recommendations = recommender.get_recommendation(fluency, coherence, accuracy, pronunciation)

# Report: banner, classifier output, metric scores, then one tip per category
print("\n" + "="*40, "          First Audio Analysis         ", "="*40, sep="\n")
print(f"Predicted Class:     {predicted_class}")
print("\n".join(
    f"{caption}{value:.2f}"
    for caption, value in (
        ("Confidence Score:    ", score),
        ("Fluency Score:      ", fluency),
        ("Coherence Score:     ", coherence),
        ("Accuracy Score:      ", accuracy),
        ("Pronunciation Score: ", pronunciation),
    )
))
print("\nRecommendations:")
for category, recommendation in recommendations.items():
    print(f"{category.capitalize()}: {recommendation}")

# Offer an optional second recording; only an exact "yes" (after trimming
# and lower-casing) continues, anything else ends this step with a goodbye.
print("\n" + "="*40)
upload_second_audio = input("    Do you want to upload a second audio file? (yes/no): ").strip().lower()

if upload_second_audio != 'yes':
    print("\nAll the best with your language learning! Bye!")
else:
    second_audio_path = input("    Please upload your second audio file (.mp3, .wav): ")

    # Score the second recording the same way as the first
    (predicted_class_2, score_2,
     fluency_2, coherence_2, accuracy_2, pronunciation_2) = analyze_user_audio(model, le, second_audio_path)

    # Tips for the second recording come from the same recommender instance
    recommendations_2 = recommender.get_recommendation(fluency_2, coherence_2, accuracy_2, pronunciation_2)

    # Report for the second recording
    print("\n" + "="*40, "         Second Audio Analysis         ", "="*40, sep="\n")
    print(f"Predicted Class:     {predicted_class_2}")
    print("\n".join(
        f"{caption}{value:.2f}"
        for caption, value in (
            ("Confidence Score:    ", score_2),
            ("Fluency Score:      ", fluency_2),
            ("Coherence Score:     ", coherence_2),
            ("Accuracy Score:      ", accuracy_2),
            ("Pronunciation Score: ", pronunciation_2),
        )
    ))
    print("\nRecommendations for Second Audio:")
    for category, recommendation in recommendations_2.items():
        print(f"{category.capitalize()}: {recommendation}")

    # Metric-by-metric change from the first recording to the second
    first_scores = [fluency, coherence, accuracy, pronunciation]
    second_scores = [fluency_2, coherence_2, accuracy_2, pronunciation_2]
    comparison_results = compare_audios(first_scores, second_scores)

    print("\n" + "="*40, "         Comparison Results            ", "="*40, sep="\n")
    for category, result in comparison_results.items():
        print(f"{category.capitalize()}: {result}")

# Ask the user to rate the recommendations; the rating is the Q-learning reward
separator = "\n" + "="*40
print(separator)
feedback = collect_feedback()
print(separator)

# Apply the same reward to every category, using the action bucket that the
# first recording's score fell into. Categories are indexed in the order of
# the recommendation dict's keys.
first_audio_scores = [fluency, coherence, accuracy, pronunciation]
for index, (cat_score, _cat_name) in enumerate(zip(first_audio_scores, recommendations)):
    recommender.update_q_values(index, recommender.get_action_index(cat_score), feedback)

print("\nQ-values updated based on feedback.")


  mel_basis = filters.mel(sr=sr, n_fft=n_fft, **kwargs)


Epoch 1/30
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 1s/step - accuracy: 0.1335 - loss: 15.7381 - val_accuracy: 0.2598 - val_loss: 2.2282
Epoch 2/30
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 2s/step - accuracy: 0.3105 - loss: 2.0450 - val_accuracy: 0.3228 - val_loss: 1.8681
Epoch 3/30
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 1s/step - accuracy: 0.4751 - loss: 1.7395 - val_accuracy: 0.4016 - val_loss: 1.4701
Epoch 4/30
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 1s/step - accuracy: 0.5837 - loss: 1.1569 - val_accuracy: 0.6142 - val_loss: 1.1616
Epoch 5/30
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 1s/step - accuracy: 0.6686 - loss: 0.9309 - val_accuracy: 0.6378 - val_loss: 1.0650
Epoch 6/30
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 2s/step - accuracy: 0.7054 - loss: 0.8208 - val_accuracy: 0.6850 - val_loss: 1.0165
Epoch 7/30
[1m16/16[0m [32m━━━━━━━━━