### Importing necessary Libraries

In [2]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
import plotly.express as px

In [3]:
# Load the clip metadata table (file name, transcript text, vote counts and
# speaker attributes) from the CSV in the working directory.
df = pd.read_csv("cv-invalid.csv")
# Bare expression on the last line so the notebook renders the frame.
df

Unnamed: 0,filename,text,up_votes,down_votes,age,gender,accent,duration
0,cv-invalid/sample-000000.mp3,revenge is not my style but obviously accident...,1,10,,,,
1,cv-invalid/sample-000001.mp3,it was bunched up and he had hardly thought of...,0,2,twenties,male,us,
2,cv-invalid/sample-000002.mp3,then suddenly he noticed it with a start,10,4,thirties,female,canada,
3,cv-invalid/sample-000003.mp3,that's the point at which most people give up,0,1,,,,
4,cv-invalid/sample-000004.mp3,you got someplace to sleep,0,1,,,,
...,...,...,...,...,...,...,...,...
25398,cv-invalid/sample-025398.mp3,well then we've got a problem,0,4,,,,
25399,cv-invalid/sample-025399.mp3,the boy was surprised at his thoughts,0,6,,,,
25400,cv-invalid/sample-025400.mp3,undefined,1,2,,,,
25401,cv-invalid/sample-025401.mp3,but there was something there in his heart tha...,1,5,,,,


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25403 entries, 0 to 25402
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   filename    25403 non-null  object 
 1   text        25403 non-null  object 
 2   up_votes    25403 non-null  int64  
 3   down_votes  25403 non-null  int64  
 4   age         5851 non-null   object 
 5   gender      5868 non-null   object 
 6   accent      5008 non-null   object 
 7   duration    0 non-null      float64
dtypes: float64(1), int64(2), object(5)
memory usage: 1.6+ MB


In [5]:
df.head()

Unnamed: 0,filename,text,up_votes,down_votes,age,gender,accent,duration
0,cv-invalid/sample-000000.mp3,revenge is not my style but obviously accident...,1,10,,,,
1,cv-invalid/sample-000001.mp3,it was bunched up and he had hardly thought of...,0,2,twenties,male,us,
2,cv-invalid/sample-000002.mp3,then suddenly he noticed it with a start,10,4,thirties,female,canada,
3,cv-invalid/sample-000003.mp3,that's the point at which most people give up,0,1,,,,
4,cv-invalid/sample-000004.mp3,you got someplace to sleep,0,1,,,,


In [6]:
df.shape

(25403, 8)

In [7]:
df.isnull().sum()

filename          0
text              0
up_votes          0
down_votes        0
age           19552
gender        19535
accent        20395
duration      25403
dtype: int64

In [8]:
# List the columns that contain at least one missing value
df.columns[df.isnull().any()].tolist()

['age', 'gender', 'accent', 'duration']

In [9]:
# Drop the columns reported above as containing missing values (duration is
# entirely null; age, gender and accent are null for most rows).
# Rebinding `df` instead of using inplace=True, together with
# errors='ignore', keeps the cell re-runnable: a second execution no longer
# raises KeyError for columns that are already gone.
df = df.drop(columns=['duration', 'age', 'gender', 'accent'], errors='ignore')
print('Cleaned Data:')
# Last expression of the cell, so the notebook renders it as a rich table.
df.head()

Cleaned Data:
                       filename  \
0  cv-invalid/sample-000000.mp3   
1  cv-invalid/sample-000001.mp3   
2  cv-invalid/sample-000002.mp3   
3  cv-invalid/sample-000003.mp3   
4  cv-invalid/sample-000004.mp3   

                                                text  up_votes  down_votes  
0  revenge is not my style but obviously accident...         1          10  
1  it was bunched up and he had hardly thought of...         0           2  
2           then suddenly he noticed it with a start        10           4  
3      that's the point at which most people give up         0           1  
4                         you got someplace to sleep         0           1  


In [10]:
df.keys()

Index(['filename', 'text', 'up_votes', 'down_votes'], dtype='object')

### Fitting and plotting a best-fit line (down votes vs. up votes)

In [11]:
from sklearn.linear_model import LinearRegression

# Fit an ordinary least-squares line predicting down_votes from up_votes.
model = LinearRegression()
model.fit(df[['up_votes']], df['down_votes'])

# Per-row predictions; the column is kept on the frame exactly as before.
df['y_pred'] = model.predict(df[['up_votes']])

# Visualizing the results using plotly
fig = px.scatter(df, x='up_votes', y='down_votes', opacity=0.45,
                 labels={'up_votes': 'Up Votes', 'down_votes': 'Down Votes'},
                 title='Best fit line for line plot')

# A straight line is fully determined by its two end points, so draw it from
# the smallest and largest up_votes value instead of pushing every
# (unsorted) row of the frame into the line trace.
line_x = pd.DataFrame({'up_votes': [df['up_votes'].min(), df['up_votes'].max()]})
fig.add_scatter(x=line_x['up_votes'], y=model.predict(line_x),
                mode='lines', name='Best fit line')
fig.show()

### Feature Extraction with Librosa

In [4]:
import librosa
import pandas as pd
import numpy as np
import zipfile
import os
from joblib import Parallel, delayed
from tqdm import tqdm
import re

# --- Configuration ---
# The archive location can be overridden with the CV_ARCHIVE_ZIP environment
# variable; the original machine-specific path stays as the default so
# existing runs behave the same.
ZIP_PATH = os.environ.get("CV_ARCHIVE_ZIP", "/media/mk4700/disk/save_Disk/f/archive.zip")
CSV_IN_ZIP = "cv-invalid.csv"
OUTPUT_FILE = "audio_features.csv"
BATCH_SIZE = 500  # Audio feature batch size

# --- Load and Fix Paths ---
with zipfile.ZipFile(ZIP_PATH) as z:
    # Read the metadata table straight out of the archive.
    with z.open(CSV_IN_ZIP) as f:
        df = pd.read_csv(f)
    # NOTE(review): this rewrites the 'cv-invalid/' prefix to the
    # 'cv-valid-train/cv-valid-train/' folder, so the text and vote columns
    # of cv-invalid.csv end up paired with audio members stored under
    # cv-valid-train. Confirm the two sets share sample numbering; otherwise
    # the extracted features describe different clips than the transcripts.
    df['filename'] = df['filename'].str.replace(
        'cv-invalid/',
        'cv-valid-train/cv-valid-train/',
        regex=False
    )
    # Keep only rows whose (rewritten) path is an actual member of the zip.
    zip_files = set(z.namelist())
    df = df[df['filename'].isin(zip_files)].copy()

print(f"Found {len(df)} valid files to process")

# --- Simple Text Quality Score (No Java Required) ---
def calc_text_quality(text):
    """
    Heuristic text-quality score, clamped to the range [0.1, 1.0].

    The score starts at 1.0 and loses a fixed penalty for each failed check:
    - first word does not start with an uppercase letter (-0.2)
    - no punctuation from ``. ! ? , ; :`` anywhere in the text (-0.2)
    - fewer than half of the words are unique (-0.2)
    - the text is entirely lowercase or entirely uppercase (-0.1)
    - average word length is below 2 or above 10 characters (-0.1)

    Parameters
    ----------
    text : str or None
        The transcript to score. Anything that is not a string (``None``,
        a NaN produced by pandas for a missing cell, ...) is treated like
        empty text instead of raising.

    Returns
    -------
    float
        0.1 for missing, empty or whitespace-only text, otherwise the
        penalised score.
    """
    # NaN is truthy, so a plain `not text` test would let it through and
    # crash on `.strip()`; check the type explicitly.
    if not isinstance(text, str) or not text.strip():
        return 0.1

    score = 1.0
    # Non-blank text always yields at least one (non-empty) word here.
    words = text.split()

    # Check capitalization (first word should be capitalized)
    if not words[0][0].isupper():
        score -= 0.2

    # Check for punctuation
    if not re.search(r'[.!?,;:]', text):
        score -= 0.2

    # Check word diversity (penalize excessive repetition)
    unique_ratio = len(set(words)) / len(words)
    if unique_ratio < 0.5:
        score -= 0.2

    # Check for proper sentence structure (not all lowercase/uppercase)
    if text.isupper() or text.islower():
        score -= 0.1

    # Check average word length (2-10 characters is treated as normal)
    avg_word_len = sum(len(w) for w in words) / len(words)
    if avg_word_len < 2 or avg_word_len > 10:
        score -= 0.1

    return max(0.1, min(1.0, score))

# --- Parallel Text Quality Scoring ---
def calc_quality_batch(texts):
    """Score every text in ``texts`` and return the scores as a list, in input order."""
    return list(map(calc_text_quality, texts))

# --- Apply Text Quality Score in Parallel ---
# Skipped when the column already exists, so re-running the cell is cheap.
if 'text_quality_score' not in df.columns:
    print("Calculating text quality scores in parallel...")
    texts = df['text'].tolist()
    # os.cpu_count() returns None when the count cannot be determined;
    # fall back to a single worker instead of failing in min().
    num_jobs = min(os.cpu_count() or 1, 8)  # Limit to avoid overhead
    chunk_size = max(100, len(texts) // num_jobs)

    # Contiguous slices, so concatenating the per-chunk results below
    # restores the original row order.
    text_chunks = [texts[i:i+chunk_size] for i in range(0, len(texts), chunk_size)]

    quality_scores = Parallel(n_jobs=num_jobs)(
        delayed(calc_quality_batch)(chunk) for chunk in tqdm(text_chunks, desc="Scoring text quality")
    )

    # Flatten the list of per-chunk score lists back into one score per row.
    df['text_quality_score'] = [score for sublist in quality_scores for score in sublist]
    df = df.dropna(subset=['text_quality_score'])
    print(f" Text quality scores calculated for {len(df)} samples")

# --- Audio Feature Extraction ---
def extract_features(file_path_in_zip):
    """
    Compute summary audio features for one member of the archive at ZIP_PATH.

    The member is opened from the zip, decoded with librosa at 22050 Hz and
    reduced to a flat dict: the member name under 'filename', the mean of
    each of the 13 MFCC rows ('mfcc_0_mean' .. 'mfcc_12_mean'), and the means
    of the zero-crossing rate, spectral centroid and spectral rolloff.

    Any exception is reported with a printed message and turns the result
    into None, so one unreadable file does not abort a whole batch.
    """
    try:
        with zipfile.ZipFile(ZIP_PATH) as archive, archive.open(file_path_in_zip) as audio_file:
            y, sr = librosa.load(audio_file, sr=22050, res_type='kaiser_fast')
            mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)

            # Insertion order fixes the column order of the resulting table.
            features = {'filename': file_path_in_zip}
            for idx in range(13):
                features[f'mfcc_{idx}_mean'] = np.mean(mfcc[idx])
            features['zcr_mean'] = np.mean(librosa.feature.zero_crossing_rate(y))
            features['spectral_centroid'] = np.mean(librosa.feature.spectral_centroid(y=y, sr=sr))
            features['spectral_rolloff'] = np.mean(librosa.feature.spectral_rolloff(y=y, sr=sr))
            return features
    except Exception as err:
        print(f" Error processing {file_path_in_zip}: {str(err)}")
        return None

# --- Continue if already exists ---
processed_files = set()
if os.path.exists(OUTPUT_FILE):
    # Only the filename column is needed to know which files are done, so
    # skip parsing all the numeric feature columns.
    existing_df = pd.read_csv(OUTPUT_FILE, usecols=['filename'])
    processed_files = set(existing_df['filename'])
    print(f" Resuming from {len(processed_files)} processed files")

# Get remaining files to process. The already-processed rows are removed
# with one vectorised mask, so iterrows() only walks the rows still pending.
pending_mask = ~df['filename'].isin(processed_files)
remaining = [row for _, row in df[pending_mask].iterrows()]

print(f"📊 Processing {len(remaining)} remaining files...")

# --- Process in Batches ---
for i in tqdm(range(0, len(remaining), BATCH_SIZE), desc="🎧 Extracting audio features", unit="batch"):
    batch = remaining[i:i + BATCH_SIZE]

    # Extract audio features in parallel; results come back in batch order.
    results = Parallel(n_jobs=-1)(
        delayed(extract_features)(row['filename']) for row in batch
    )

    # Keep every successful result together with the row it was computed
    # from. Taking scores from the first len(successful) rows of the batch
    # would shift them onto the wrong files as soon as one extraction in the
    # middle of the batch fails.
    paired = [(row, feats) for row, feats in zip(batch, results) if feats is not None]
    successful = [feats for _, feats in paired]

    if successful:
        successful_df = pd.DataFrame(successful)
        # Add text quality scores (aligned with the rows that succeeded)
        successful_df['text_quality_score'] = [
            row['text_quality_score'] for row, _ in paired
        ]

        # An existing but empty file still needs a header row.
        file_has_data = os.path.exists(OUTPUT_FILE) and os.path.getsize(OUTPUT_FILE) > 0
        if file_has_data:
            # Appending under a header written by a different column layout
            # silently misaligns every column when the file is read back, so
            # refuse to mix layouts.
            existing_cols = list(pd.read_csv(OUTPUT_FILE, nrows=0).columns)
            if existing_cols != list(successful_df.columns):
                raise ValueError(
                    f"{OUTPUT_FILE} has columns {existing_cols}, which do not match "
                    f"the features being written {list(successful_df.columns)}; "
                    "move or delete the old file before re-running."
                )

        # Append to CSV
        successful_df.to_csv(
            OUTPUT_FILE,
            mode='a',
            header=not file_has_data,
            index=False
        )

print(f"✅ Feature extraction complete! Results saved to {OUTPUT_FILE}")



SyntaxError: unterminated string literal (detected at line 16) (3660038147.py, line 16)

In [15]:
# Sanity-check the saved feature table.
df_check = pd.read_csv('audio_features.csv')
# Only the last expression of a cell is rendered, so the shape has to be
# printed explicitly or it is silently discarded.
print(df_check.shape)
df_check.head()

Unnamed: 0,Unnamed: 1,filename,grammar_score,mfcc_0_mean,mfcc_1_mean,mfcc_2_mean,mfcc_3_mean,mfcc_4_mean,mfcc_5_mean,mfcc_6_mean,mfcc_7_mean,mfcc_8_mean,mfcc_9_mean,mfcc_10_mean,mfcc_11_mean,mfcc_12_mean,zcr_mean
cv-valid-train/cv-valid-train/sample-000000.mp3,-271.31665,117.64021,-31.829422,28.0848,-3.839317,0.620063,-5.123722,-9.072942,-7.756686,2.387066,-11.028577,5.967481,-11.820976,0.125673,2055.841922,3836.194882,0.5
cv-valid-train/cv-valid-train/sample-000001.mp3,-699.8641,79.09568,4.884256,20.39182,-1.722654,2.290766,-7.017201,3.141892,-10.726935,4.120934,-7.072683,2.027022,-5.483835,0.214634,2897.610159,5727.237556,0.5
cv-valid-train/cv-valid-train/sample-000002.mp3,-335.8503,62.55013,13.792666,40.112946,6.636197,17.079103,-13.959596,5.31031,-18.189528,4.219509,-11.414381,-0.935486,-27.50059,0.152703,2711.420157,5271.210135,0.5
cv-valid-train/cv-valid-train/sample-000003.mp3,-529.704,96.79963,6.058775,16.39942,-1.047446,2.513682,-13.614449,-10.405475,-11.53866,-4.264429,-5.259208,-2.680506,-6.554389,0.143143,2240.564428,4441.354445,0.5
cv-valid-train/cv-valid-train/sample-000004.mp3,-78.12065,92.09714,-13.072988,29.388163,1.074569,11.947689,-11.105109,7.265137,-10.549742,-2.479722,-15.612686,0.858765,-16.99436,0.105473,2240.401652,4363.943438,0.5
