# Implementation!
This notebook is simply an answer for your questions but using code. please take into account that the notebook below was made quickly due to lack of time - I can do even better then that ;)

# Plan

In [1]:
# import libs and tools
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
import matplotlib.pyplot as plt 
import seaborn as sns
import spacy
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from collections import Counter
import language_tool_python
import textstat
import re
from textblob import TextBlob
import language_tool_python

In [2]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Adam\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [3]:
# to display whole columns
pd.set_option('display.max_colwidth', None)

In [4]:
# load the transcripts
df = pd.read_csv(r"C:\Users\Adam\Desktop\main\programming\machine learning\voice call analyzer\data\sample_transcripts.csv")
df.head()

Unnamed: 0,transcript_id,transcript
0,1,"Hello, my name is John Doe. I’m calling from New York regarding an issue with my internet connection. It’s been down since Monday. Can you help me fix it?"
1,2,"Uh, so like, um, I bought this thing and it’s, uh, not working. You know? Like, I need, uh, help or something."
2,3,"Good afternoon, I’m Maria Sanchez. I placed an order last week and I haven't received a confirmation email. Could you check on that for me?"
3,4,"Yeah hi, yeah, the delivery never came. It’s, like, really annoying. Uh, I want a refund or whatever."
4,5,"Hi, I’m Dr. Alan Turing, and I was inquiring about the status of my reimbursement request. It was submitted on the 10th of June. Please advise."


In [39]:
# word count col
df['word_count'] = df['transcript'].apply(lambda x: len([w for w in word_tokenize(x) if w.isalpha()]))

In [40]:
df.head()

Unnamed: 0,transcript_id,transcript,word_count
0,1,"Hello, my name is John Doe. I’m calling from New York regarding an issue with my internet connection. It’s been down since Monday. Can you help me fix it?",31
1,2,"Uh, so like, um, I bought this thing and it’s, uh, not working. You know? Like, I need, uh, help or something.",23
2,3,"Good afternoon, I’m Maria Sanchez. I placed an order last week and I haven't received a confirmation email. Could you check on that for me?",26
3,4,"Yeah hi, yeah, the delivery never came. It’s, like, really annoying. Uh, I want a refund or whatever.",19
4,5,"Hi, I’m Dr. Alan Turing, and I was inquiring about the status of my reimbursement request. It was submitted on the 10th of June. Please advise.",25


### 2. Extract Information
We’ll extract these features:
* Basic NLP metrics
* Sentence structure
* Sentiment
* Filler words
* Named Entity Recognition (e.g., names, orgs)
* POS diversity
* Grammar error count
This information will help us to check if transcription is good for Rnglish level measure.

In [11]:
tool = language_tool_python.LanguageTool('en-US')
nlp = spacy.load("en_core_web_sm")

In [45]:
def extract_info(transcript: str):
    """
    Extracts linguistic, grammatical, and semantic features from a transcript.
    Returns a dictionary with metrics such as word count, uniqueness, sentence length,
    sentiment, filler ratio, named entities, POS diversity, and grammar score.
    """
    doc = nlp(transcript)

    # basic NLP metrics
    word_count = len([token.text for token in doc if token.is_alpha or token.is_digit])
    unique_words = set(token.text.lower() for token in doc if token.is_alpha)
    unique_ratio = len(unique_words) / word_count if word_count else 0
    
    # sentence structure
    sentences = list(doc.sents)
    avg_sentence_len = sum(len(sent) for sent in sentences) / len(sentences) if sentences else 0

    # sentiment
    sentiment = TextBlob(transcript).sentiment.polarity

    # filler words
    fillers = re.findall(r'\b(uh|um|you know|like|erm|hmm)\b', transcript.lower())
    filler_ratio = len(fillers) / word_count if word_count else 0

    # named Entity Recognition (e.g., names, orgs)
    entities = [(ent.text, ent.label_) for ent in doc.ents]

    # POS diversity
    pos_tags = [token.pos_ for token in doc]
    pos_diversity = len(set(pos_tags)) / len(pos_tags) if pos_tags else 0
    
    # grammar error count
    grammar_matches = tool.check(transcript)
    grammar_errors = len(grammar_matches)
    grammar_score = max(0, 1 - (grammar_errors / len(sentences))) if sentences else 0
    
    return {
        "word_count": word_count,
        "unique_ratio": unique_ratio,
        "avg_sentence_len": avg_sentence_len,
        "sentiment": sentiment,
        "filler_ratio": filler_ratio,
        "entities": entities,
        "pos_diversity": pos_diversity,
        "grammar_errors": grammar_errors,
        "grammar_score": round(grammar_score, 2)
    }

In [43]:
sample ='Hello, my name is John Doe. I’m calling from New York regarding an issue with my internet connection. It’s been down since Monday. Can you help me fix it?	'

In [44]:
info = extract_info(sample)
print("Extracted Info:", info)

Extracted Info: {'word_count': 29, 'unique_ratio': 0.9310344827586207, 'avg_sentence_len': 9.25, 'sentiment': -0.009595959595959616, 'filler_ratio': 0.0, 'entities': [('John Doe', 'PERSON'), ('New York', 'GPE'), ('Monday', 'DATE')], 'pos_diversity': 0.32432432432432434, 'grammar_errors': 0, 'grammar_score': 1.0}


###  3.Is good?
The next step is to check if our transcription is good for further preprocessing. It can help us determine the accuracy of prediction.

In [46]:
def is_transcription_good(info: dict):
    """
    Evaluates if a transcription meets quality thresholds for word count, 
    lexical richness, sentiment neutrality, filler frequency, and presence of named entities.
    Returns True if quality is sufficient, otherwise False.
    """
    return (
        info["word_count"] >= 30 and
        info["unique_ratio"] >= 0.4 and
        abs(info["sentiment"]) < 0.8 and  
        info["filler_ratio"] < 0.05 and
        any(label in ['PERSON', 'ORG', 'GPE'] for _, label in info["entities"])  
    )

In [28]:
print("Is transcription good?", is_transcription_good(info))

Is transcription good? False


### 4. English level
Finally we need to measure English lvl. Logic for measuring language proficiency level is based on rule-based heuristics inspired by CEFR.

In [49]:
def estimate_proficiency(info: dict, quality_threshold: float = 0.5):
    """
    Estimates language proficiency level (A1 to C2+) based on various linguistic metrics,
    including word count, lexical diversity, sentence length, filler usage, grammar, and POS diversity.
    Returns a CEFR-aligned proficiency level label.
    """
    level_score = 0

    if info["word_count"] > 50: level_score += 1
    if info["unique_ratio"] > 0.4: level_score += 1
    if info["avg_sentence_len"] > 12: level_score += 1
    if info["filler_ratio"] < 0.03: level_score += 1
    if info["grammar_score"] > 0.75: level_score += 1
    if info["pos_diversity"] > 0.3: level_score += 1

    # rule-based scoring
    levels = {
        0: "A1 (Beginner)",
        1: "A2 (Elementary)",
        2: "B1 (Intermediate)",
        3: "B2 (Upper Intermediate)",
        4: "C1 (Advanced)",
        5: "C2 (Proficient)",
        6: "C2+ (Near-native)"
    }
    return levels.get(level_score, "A1 (Beginner)")

In [50]:
print("Estimated English level:", estimate_proficiency(info))

Estimated English level: C1 (Advanced)


### 5. Let's connect everthing!
Finally, now we can enjoy the fruit of our work! Now I just need to simply connect all the functions.

In [60]:
def process_transcripts(df):
    results = []
    counter = 0
    for idx, row in df.iterrows():
        counter+=1
        print(f'Row nr. {counter}\n')
        transcript = row['transcript']
        info = extract_info(transcript)
        is_good = is_transcription_good(info)
        level = estimate_proficiency(info)

        results.append({
            'id': row.get("id",idx),
            'word_count':info['word_count'],
            'unique_ratio':round(info['unique_ratio'],2),
            "avg_sentence_len": round(info["avg_sentence_len"], 2),
            "sentiment": round(info["sentiment"], 2),
            "filler_ratio": round(info["filler_ratio"], 2),
            "grammar_score": info["grammar_score"],
            "pos_diversity": round(info["pos_diversity"], 2),
            "entities": info["entities"],
            "grammar_errors": info["grammar_errors"],
            "is_good": is_good,
            "estimated_level": level
        })
        print(f'Results: {results}')
    return pd.DataFrame(results)

In [61]:
evaluated_df = process_transcripts(df)

Row nr. 1

Results: [{'id': 0, 'word_count': 29, 'unique_ratio': 0.93, 'avg_sentence_len': 9.0, 'sentiment': -0.01, 'filler_ratio': 0.0, 'grammar_score': 1.0, 'pos_diversity': 0.31, 'entities': [('John Doe', 'PERSON'), ('New York', 'GPE'), ('Monday', 'DATE')], 'grammar_errors': 0, 'is_good': False, 'estimated_level': 'C1 (Advanced)'}]
Row nr. 2

Results: [{'id': 0, 'word_count': 29, 'unique_ratio': 0.93, 'avg_sentence_len': 9.0, 'sentiment': -0.01, 'filler_ratio': 0.0, 'grammar_score': 1.0, 'pos_diversity': 0.31, 'entities': [('John Doe', 'PERSON'), ('New York', 'GPE'), ('Monday', 'DATE')], 'grammar_errors': 0, 'is_good': False, 'estimated_level': 'C1 (Advanced)'}, {'id': 1, 'word_count': 22, 'unique_ratio': 0.82, 'avg_sentence_len': 11.33, 'sentiment': 0.0, 'filler_ratio': 0.32, 'grammar_score': 0.67, 'pos_diversity': 0.26, 'entities': [], 'grammar_errors': 1, 'is_good': False, 'estimated_level': 'A2 (Elementary)'}]
Row nr. 3

Results: [{'id': 0, 'word_count': 29, 'unique_ratio': 0.93

In [62]:
evaluated_df

Unnamed: 0,id,word_count,unique_ratio,avg_sentence_len,sentiment,filler_ratio,grammar_score,pos_diversity,entities,grammar_errors,is_good,estimated_level
0,0,29,0.93,9.0,-0.01,0.0,1.0,0.31,"[(John Doe, PERSON), (New York, GPE), (Monday, DATE)]",0,False,C1 (Advanced)
1,1,22,0.82,11.33,0.0,0.32,0.67,0.26,[],1,False,A2 (Elementary)
2,2,25,0.92,10.33,0.35,0.0,1.0,0.35,"[(afternoon, TIME), (Maria Sanchez, PERSON), (last week, DATE)]",0,False,C1 (Advanced)
3,3,18,0.94,9.0,-0.8,0.11,1.0,0.33,[],0,False,B2 (Upper Intermediate)
4,4,24,0.83,10.67,0.0,0.0,1.0,0.31,"[(Alan Turing, PERSON), (the 10th of June, DATE)]",0,False,C1 (Advanced)


In [64]:
# save the result
df.to_csv('output.csv', index=False)