# libraries

In [1]:
import tempfile
import os
import json
import re
import numpy as np
import warnings
from pathlib import Path
from urllib.parse import urlparse
warnings.filterwarnings("ignore")


!pip uninstall -y whisper
!pip install -q openai-whisper


try:
    import yt_dlp
    from moviepy.editor import VideoFileClip
    import imageio_ffmpeg
    import whisper
    import librosa
except ImportError:
    print("Installing required libraries...")
    !pip install -q yt-dlp moviepy imageio-ffmpeg openai-whisper librosa
    import yt_dlp
    from moviepy.editor import VideoFileClip
    import imageio_ffmpeg
    import whisper
    import librosa



try:
    whisper.load_model
except AttributeError:
    print("Fixing whisper installation...")
    import sys
    for mod in list(sys.modules.keys()):
        if mod == 'whisper' or mod.startswith('whisper.'):
            del sys.modules[mod]
    !pip uninstall -y whisper
    !pip install -q --force-reinstall openai-whisper
    import whisper

Found existing installation: whisper 1.1.10
Uninstalling whisper-1.1.10:
  Successfully uninstalled whisper-1.1.10


# class for video processing

In [2]:
class VideoProcessor:
    def __init__(self):
        self.supported_domains = [
            'youtube.com', 'youtu.be', 'loom.com', 'vimeo.com',
            'dropbox.com', 'drive.google.com'
        ]
        self.ydl_opts = {
            'format': 'best[ext=mp4]/best',
            'outtmpl': '%(title)s.%(ext)s',
            'quiet': True,
            'no_warnings': True,
        }

    def validate_url(self, url: str) -> bool:
        try:
            parsed = urlparse(url)
            if parsed.path.endswith(('.mp4', '.avi', '.mov', '.mkv', '.webm')):
                return True
            domain = parsed.netloc.lower()
            return any(supported in domain for supported in self.supported_domains)
        except Exception:
            return False

    def download_video(self, url: str, output_dir: str) -> str:
        try:
            output_template = os.path.join(output_dir, '%(title)s.%(ext)s')
            ydl_opts = {**self.ydl_opts, 'outtmpl': output_template}
            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                info = ydl.extract_info(url, download=False)
                if info.get('duration', 0) > 600:
                    raise Exception("Video too long (max 10 minutes supported)")
                ydl.download([url])
                title = info.get('title', 'video')
                ext = info.get('ext', 'mp4')
                safe_title = re.sub(r'[<>:"/\\|?*]', '_', title)
                video_path = os.path.join(output_dir, f"{safe_title}.{ext}")
                for file in os.listdir(output_dir):
                    if file.endswith(('.mp4', '.webm', '.mkv', '.avi')):
                        actual_path = os.path.join(output_dir, file)
                        if actual_path != video_path:
                            os.rename(actual_path, video_path)
                        return video_path
                return video_path
        except Exception as e:
            raise Exception(f"Failed to download video: {str(e)}")

    def extract_audio(self, video_path: str, output_dir: str) -> str:
        try:
            os.environ['IMAGEIO_FFMPEG_EXE'] = imageio_ffmpeg.get_ffmpeg_exe()

            video_name = Path(video_path).stem
            audio_path = os.path.join(output_dir, f"{video_name}.wav")

            clip = VideoFileClip(video_path)
            clip.audio.write_audiofile(audio_path, fps=16000, nbytes=2, codec='pcm_s16le',
                                     ffmpeg_params=["-ac", "1"], logger=None)
            clip.close()

            if not os.path.exists(audio_path) or os.path.getsize(audio_path) < 1000:
                raise Exception("Audio extraction failed")
            return audio_path
        except Exception as e:
            raise Exception(f"Failed to extract audio: {str(e)}")

# class for audio processing

In [3]:
class AudioAnalyzer:
    def __init__(self, model_size: str = "base"):
        self.model_size = model_size
        self.whisper_model = None

    def load_whisper_model(self):
        if self.whisper_model is None:
            self.whisper_model = whisper.load_model(self.model_size)
        return self.whisper_model

    def transcribe_audio(self, audio_path: str):
        try:
            model = self.load_whisper_model()
            result = model.transcribe(
                audio_path,
                language="en",
                task="transcribe",
                temperature=0.0
            )
            return {'text': result['text'].strip()}
        except Exception as e:
            raise Exception(f"Transcription failed: {str(e)}")

    def get_speech_statistics(self, transcript: str, duration: float):
        try:
            words = transcript.split()
            word_count = len(words)
            wpm = (word_count / duration) * 60 if duration > 0 else 0
            sentence_count = len(re.split(r'[.!?]+', transcript))

            return {
                'word_count': word_count,
                'wpm': wpm,
                'sentence_count': sentence_count
            }
        except Exception:
            return {'word_count': 0, 'wpm': 0, 'sentence_count': 0}

    def extract_acoustic_features(self, audio_path: str):
        try:
            y, sr = librosa.load(audio_path, sr=16000)
            duration = len(y) / sr
            return {'duration': duration}
        except Exception as e:
            raise Exception(f"Feature extraction failed: {str(e)}")

# Accent detection

In [4]:
class AccentDetector:
    def __init__(self):

        self.word_patterns = {
            "American": [
                "awesome", "guys", "totally", "math", "color", "elevator", "apartment", "vacation",
                "sidewalk", "subway", "garbage", "mail", "truck", "dessert", "sneakers", "candy",
                "movie", "cellphone", "garbage", "gas", "soccer", "fall", "parking lot", "line",
                "drapes", "cookies", "closet", "pants", "gotten", "aluminum", "z", "tomato",
                "schedule", "college", "spelled", "center", "defense", "meter", "catalog", "tire"
            ],
            "British": [
                "brilliant", "proper", "cheers", "mate", "quid", "maths", "colour", "lift", "flat",
                "holiday", "jumper", "nappy", "queue", "rubbish", "post", "lorry", "pudding",
                "trainers", "sweets", "film", "mobile", "bin", "petrol", "football", "autumn",
                "car park", "queue", "curtains", "biscuits", "wardrobe", "trousers", "got",
                "aluminium", "zed", "tomahto", "schedule", "university", "spelt", "centre",
                "defence", "metre", "catalogue", "tyre"
            ],
            "Australian": [
                "mate", "reckon", "arvo", "barbie", "footy", "g'day", "ripper", "sheila",
                "chockers", "thongs", "bottle-o", "servo", "ute", "Maccas", "brekkie", "bickie",
                "mozzie", "daggy", "dunny", "esky", "stubby", "sunnies", "lollies", "manchester",
                "docket", "capsicum", "rock melon", "fairy floss", "runners", "singlet"
            ]
        }


        self.phonetic_patterns = {
            "American": [
                r"r\b", r"t(?=\w)", r"a(?=th)", r"(?<!\w)a(?=\s|$)", r"can't\s+even",
                r"(?<!\w)o(?=\w)", r"ot(?=\w)", r"(?<!\w)z\w+", r"(?<!t)ed\b"
            ],
            "British": [
                r"r(?!\w)", r"t\b", r"a(?=ss|st|sk)", r"(?<!\w)ah(?=\s|$)", r"quite", r"rather",
                r"indeed", r"(?<!\w)o\w+", r"(?<!\w)ou\w+", r"(?<!c)ent\b", r"(?<!\w)u(?=\w)",
                r"(?<!\w)t\w+", r"(?<!\w)d\w+"
            ],
            "Australian": [
                r"ai(?=\s|$)", r"ay(?=\s|$)", r"i(?=\s|$)", r"(?<!\w)a\w+", r"(?<!\w)oi\w+",
                r"(?<!\w)ei\w+", r"(?<!\w)ou\w+", r"(?<!\w)g'", r"(?<!\w)strewth"
            ]
        }


        self.grammar_patterns = {
            "American": [
                r"(?<!\w)gotten\b", r"(?<!\w)I\s+already\s+", r"(?<!\w)did\s+you\s+",
                r"(?<!\w)(?:he|she|it)\s+(?:don't|doesn't)\s+"
            ],
            "British": [
                r"(?<!\w)got\b", r"(?<!\w)I've\s+already\s+", r"(?<!\w)have\s+you\s+",
                r"(?<!\w)(?:he|she|it)\s+(?:have|has)n't\s+"
            ],
            "Australian": [
                r"(?<!\w)reckon\b", r"(?<!\w)heaps\s+of\b", r"(?<!\w)too\s+easy\b",
                r"(?<!\w)no\s+worries\b"
            ]
        }


        self.strong_indicators = {
            "American": [
                "I could care less", "awesome sauce", "dude", "y'all", "right off the bat",
                "touch base", "period", "for sure", "my bad", "rain check"
            ],
            "British": [
                "bloody hell", "sorted", "fancy a", "spot on", "cheeky", "knackered",
                "chuffed", "gobsmacked", "blimey", "bloke", "fortnight", "whilst", "proper",
                "innit", "cheers", "rubbish", "brilliant", "quid"
            ],
            "Australian": [
                "fair dinkum", "crikey", "no dramas", "yeah nah", "she'll be right",
                "too easy", "flat out", "heaps good", "how ya going", "good on ya"
            ]
        }

    def analyze_word_usage(self, transcript):
        scores = {}
        transcript_lower = transcript.lower()
        matches_by_accent = {}

        for accent, words in self.word_patterns.items():
            matches = []
            for word in words:
                if re.search(r'\b' + re.escape(word) + r'\b', transcript_lower):
                    matches.append(word)


            count = len(matches)


            score = count / max(5, min(len(words), 15)) if count > 0 else 0

            scores[accent] = score
            matches_by_accent[accent] = matches

        return scores, matches_by_accent

    def analyze_phonetic_patterns(self, transcript):

        scores = {}
        transcript_lower = transcript.lower()
        matches_by_accent = {}

        for accent, patterns in self.phonetic_patterns.items():
            matches = []
            for pattern in patterns:
                found = re.findall(pattern, transcript_lower)
                if found:
                    matches.extend(found[:5])


            score = min(1.0, len(matches) / 10)
            scores[accent] = score
            matches_by_accent[accent] = matches

        return scores, matches_by_accent

    def analyze_grammar(self, transcript):

        scores = {}
        transcript_lower = transcript.lower()

        for accent, patterns in self.grammar_patterns.items():
            matches = []
            for pattern in patterns:
                found = re.findall(pattern, transcript_lower)
                if found:
                    matches.extend(found[:3])


            score = min(1.0, len(matches) / 2)
            scores[accent] = score

        return scores

    def check_strong_indicators(self, transcript):

        scores = {}
        transcript_lower = transcript.lower()

        for accent, phrases in self.strong_indicators.items():
            matches = []
            for phrase in phrases:
                if phrase.lower() in transcript_lower:
                    matches.append(phrase)


            score = min(1.0, len(matches) * 0.8)
            scores[accent] = score

        return scores

    def classify_accent(self, audio_path, transcript):

        word_scores, word_matches = self.analyze_word_usage(transcript)
        phonetic_scores, phonetic_matches = self.analyze_phonetic_patterns(transcript)
        grammar_scores = self.analyze_grammar(transcript)
        indicator_scores = self.check_strong_indicators(transcript)


        analysis_details = {
            "word_scores": word_scores,
            "phonetic_scores": phonetic_scores,
            "grammar_scores": grammar_scores,
            "indicator_scores": indicator_scores,
            "word_matches": word_matches,
            "phonetic_matches": phonetic_matches
        }

        weights = {
            "word": 0.4,
            "phonetic": 0.3,
            "grammar": 0.1,
            "indicator": 0.2
        }

        combined_scores = {}
        for accent in self.word_patterns.keys():
            combined_scores[accent] = (
                weights["word"] * word_scores.get(accent, 0) +
                weights["phonetic"] * phonetic_scores.get(accent, 0) +
                weights["grammar"] * grammar_scores.get(accent, 0) +
                weights["indicator"] * indicator_scores.get(accent, 0)
            )

        british_words = ["class", "bath", "dance", "half", "can't", "laugh", "tomato", "schedule"]
        british_word_pattern = r'\b(' + '|'.join(british_words) + r')\b'
        british_words_in_transcript = re.findall(british_word_pattern, transcript.lower())

        if british_words_in_transcript:
            combined_scores["British"] += 0.2 * min(1.0, len(british_words_in_transcript) / 3)


        if all(score < 0.1 for score in combined_scores.values()):
            top_accent = "Neutral/Unknown"
            confidence = 30.0
        else:
            top_accent = max(combined_scores, key=combined_scores.get)


            top_score = combined_scores[top_accent]


            if top_score < 0.1:
                confidence = 30.0
            else:

                confidence = min(100, 30 + (top_score * 70) ** 0.7 * 100)


            other_scores = [s for a, s in combined_scores.items() if a != top_accent]
            if other_scores:
                next_best = max(other_scores)
                separation = top_score - next_best


                if separation > 0.1:
                    confidence = min(100, confidence + 10)


                if separation < 0.05 and top_score > 0:
                    confidence = max(30, confidence - 10)


            if top_score > 0.3:
                confidence = max(confidence, 70)
            if top_score > 0.5:
                confidence = max(confidence, 85)


        explanations = {
            "American": "Detected American accent based on vocabulary choices (like 'color', 'elevator'), pronunciation patterns, and sentence structure.",
            "British": "Speech exhibits British English characteristics including vocabulary (like 'colour', 'lift'), non-rhotic pronunciation, and distinctive intonation patterns.",
            "Australian": "Voice contains Australian English markers in word choice, rising intonation, and distinctive vowel sounds.",
            "Neutral/Unknown": "No strong accent markers detected or insufficient speech sample."
        }

        return {
            "accent": top_accent,
            "confidence": confidence,
            "explanation": explanations.get(top_accent, "Accent detected based on speech analysis."),
            "analysis_details": analysis_details,
            "raw_scores": combined_scores
        }


In [5]:
def analyze_video_accent(video_url):
    processor = VideoProcessor()

    if not processor.validate_url(video_url):
        return {"error": "Invalid or unsupported URL format"}

    with tempfile.TemporaryDirectory() as tmpdir:
        try:
            print("Downloading video...")
            video_path = processor.download_video(video_url, tmpdir)

            print("Extracting audio...")
            audio_path = processor.extract_audio(video_path, tmpdir)

            print("Transcribing speech...")
            analyzer = AudioAnalyzer()
            transcript_result = analyzer.transcribe_audio(audio_path)

            print("Extracting audio features...")
            features = analyzer.extract_acoustic_features(audio_path)

            print("Analyzing speech statistics...")
            stats = analyzer.get_speech_statistics(
                transcript_result['text'],
                features['duration']
            )

            print("Detecting accent...")
            detector = AccentDetector()
            accent_result = detector.classify_accent(
                audio_path,
                transcript_result['text']
            )


            result = {
                "accent_analysis": accent_result,
                "transcript": transcript_result['text'],
                "speech_stats": stats
            }

            return result

        except Exception as e:
            return {"error": str(e)}

# function to display results

In [6]:
def display_results(results):
    if "error" in results:
        print(f"\nError: {results['error']}")
    else:
        accent = results["accent_analysis"]["accent"]
        confidence = results["accent_analysis"]["confidence"]
        explanation = results["accent_analysis"]["explanation"]

        print("\n=== Accent Detection Result ===")
        print(f"Accent: {accent}")
        print(f"Confidence: {confidence:.2f}%")
        print(f"Explanation: {explanation}")

        print("\n=== Transcript ===")
        print(results["transcript"])

        print("\n=== Speech Stats ===")
        print(f"Words per Minute: {results['speech_stats']['wpm']:.2f}")
        print(f"Word Count: {results['speech_stats']['word_count']}")


        if "raw_scores" in results["accent_analysis"]:
            print("\n=== Raw Accent Scores ===")
            for accent, score in results["accent_analysis"]["raw_scores"].items():
                print(f"  {accent}: {score:.4f}")

        if "analysis_details" in results["accent_analysis"]:
            print("\n=== Accent Analysis Details ===")
            details = results["accent_analysis"]["analysis_details"]

            print("Word Scores:")
            for accent, score in details["word_scores"].items():
                print(f"  {accent}: {score:.3f}")

            print("\nPhonetic Scores:")
            for accent, score in details["phonetic_scores"].items():
                print(f"  {accent}: {score:.3f}")

            print("\nGrammar Scores:")
            for accent, score in details["grammar_scores"].items():
                print(f"  {accent}: {score:.3f}")

            print("\nStrong Indicator Scores:")
            for accent, score in details["indicator_scores"].items():
                print(f"  {accent}: {score:.3f}")

            if details["word_matches"]:
                for accent, matches in details["word_matches"].items():
                    if matches:
                        print(f"\n{accent} Word Matches:", ", ".join(matches))



In [7]:
video_url = "https://www.youtube.com/watch?v=owlHiqL3nAI"


results = analyze_video_accent(video_url)
display_results(results)

Downloading video...
Extracting audio...
Transcribing speech...
Extracting audio features...
Analyzing speech statistics...
Detecting accent...

=== Accent Detection Result ===
Accent: American
Confidence: 100.00%
Explanation: Detected American accent based on vocabulary choices (like 'color', 'elevator'), pronunciation patterns, and sentence structure.

=== Transcript ===
Hey everyone, welcome to another Speak English with me video. In this video you will practice you're speaking with me. And as usual we will have a dialogue where one line will be mine and the next one will be yours. I'm going to say my line and then you will read your line out loud from the screen as if you were answering me and then vice versa. This is a very convenient and effective way to improve your speaking skills in English by yourself. Today's dialogue takes place in a movie theater. First we will listen to the full dialogue so that you know how to pronounce certain words and then we'll move to the practicing