# Set up the environment

In [None]:
# Path to the lesson-distribution JSON file that is opened and parsed below.
file_to_analyze = 'Courses/English-to-کوردی سۆرانی at A1-level Lesson Scripts (Tourism)/Lesson Distribution.json'

In [None]:
import json
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from collections import Counter
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from wordcloud import WordCloud

In [None]:
# Read the file named by file_to_analyze (UTF-8) and parse its JSON content
with open(file_to_analyze, 'r', encoding="utf-8") as json_file:
    data = json.loads(json_file.read())

# Helper function to flatten lists
def flatten(lst):
    """Concatenate the items of every sub-iterable in `lst` into one new list.

    Only one level of nesting is removed; the order of items is preserved.
    """
    flat = []
    for sublist in lst:
        flat.extend(sublist)
    return flat

In [None]:
# Extract all lessons from all topics
all_lessons = flatten([topic['lessons'] for topic in data])


def _unique_in_order(lessons, key):
    """Return the distinct items stored under `key` across `lessons`.

    Items keep the order in which they first appear. ``dict.fromkeys``
    detects duplicates in constant time, replacing the quadratic
    ``item not in list`` scan. Items must be hashable; this assumes they are
    strings, which is how the rest of the notebook treats them.

    Lessons that lack `key` contribute nothing.
    """
    return list(dict.fromkeys(
        item for lesson in lessons for item in lesson.get(key, [])
    ))


total_unique_words = _unique_in_order(all_lessons, 'words')
total_unique_phrases = _unique_in_order(all_lessons, 'phrases')
total_unique_concepts = _unique_in_order(all_lessons, 'concepts')
print(f"Total unique words: {len(total_unique_words)}")
print(f"Total unique phrases: {len(total_unique_phrases)}")
print(f"Total unique concepts: {len(total_unique_concepts)}")

# Lessons whose 'type' field is exactly 'normal'
normal_lessons = [lesson for lesson in all_lessons if lesson['type'] == 'normal']

# Basic word/concept/phrase graphs

In [None]:
# Bar chart of the size of each lesson's 'words' list (0 when the key is absent)
word_counts = []
for lesson in all_lessons:
    word_counts.append(len(lesson.get('words', [])))
lesson_numbers = range(1, len(word_counts) + 1)

fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(lesson_numbers, word_counts)
ax.set_title('Word Count per Lesson')
ax.set_xlabel('Lesson Number')
ax.set_ylabel('Word Count')
plt.show()

In [None]:
# Bar chart of the size of each lesson's 'concepts' list (0 when the key is absent)
concept_counts = [len(lesson.get('concepts', [])) for lesson in all_lessons]
lesson_numbers = range(1, len(concept_counts) + 1)

fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(lesson_numbers, concept_counts)
ax.set_title('Concept Count per Lesson')
ax.set_xlabel('Lesson Number')
ax.set_ylabel('Concept Count')
plt.show()

In [None]:
# Per topic: add up the sizes of the 'words' lists of all its lessons
topic_word_counts = [
    sum(map(len, (lesson.get('words', []) for lesson in topic['lessons'])))
    for topic in data
]
topic_numbers = range(1, len(topic_word_counts) + 1)

fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(topic_numbers, topic_word_counts)
ax.set_title('Total Word Count per Topic')
ax.set_xlabel('Topic Number')
ax.set_ylabel('Total Word Count')
plt.show()

In [None]:
# Running total of the per-lesson word counts, plotted against lesson number
cumulative_words = np.cumsum(word_counts)
lesson_numbers = range(1, len(cumulative_words) + 1)

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(lesson_numbers, cumulative_words, marker='o')
ax.set_title('Cumulative Word Count over Lessons')
ax.set_xlabel('Lesson Number')
ax.set_ylabel('Cumulative Word Count')
plt.show()

In [None]:
# Histogram (with KDE overlay) of the character length of every word entry
all_words = [word for lesson in all_lessons for word in lesson.get('words', [])]
word_lengths = list(map(len, all_words))

fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(word_lengths, kde=True, bins=20, ax=ax)
ax.set_title('Distribution of Word Character Lengths')
ax.set_xlabel('Word Length')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Concept complexity, approximated by the number of whitespace-separated
# tokens in each concept string (longer is assumed to mean more complex)
concept_texts = flatten([lesson.get('concepts', []) for lesson in all_lessons])
concept_lengths = [len(text.split()) for text in concept_texts]

fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(concept_lengths, kde=True, bins=20, ax=ax)
ax.set_title('Distribution of Concept Complexity')
ax.set_xlabel('Number of Words in Concept')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Histogram of phrase lengths, measured in whitespace-separated tokens
all_phrases = [phrase for lesson in all_lessons for phrase in lesson.get('phrases', [])]
phrase_lengths = [len(tokens) for tokens in (phrase.split() for phrase in all_phrases)]

fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(phrase_lengths, kde=True, bins=20, ax=ax)
ax.set_title('Distribution of Phrase Lengths')
ax.set_xlabel('Phrase Length (words)')
ax.set_ylabel('Frequency')
plt.show()

# Word/Concept/Phrase Analysis



In [None]:
# Count how often each word entry occurs over all lessons and plot one bar per word
all_words = [word for lesson in all_lessons for word in lesson.get('words', [])]
word_freq = Counter(all_words)
bar_labels = list(word_freq.keys())
bar_heights = list(word_freq.values())

fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(bar_labels, bar_heights)
ax.set_title('Word Frequency Across All Lessons')
ax.set_xlabel('Words')
ax.set_ylabel('Frequency')
plt.xticks(rotation=90)  # vertical labels, one per word
fig.tight_layout()
plt.show()

In [None]:
# The 20 words with the highest occurrence count across all lessons
all_words = flatten([lesson.get('words', []) for lesson in all_lessons])
word_freq = Counter(all_words)
top_words = pd.Series(word_freq).nlargest(20)

fig, ax = plt.subplots(figsize=(12, 6))
top_words.plot.bar(ax=ax)
ax.set_title('Top 20 Most Common Words')
ax.set_xlabel('Words')
ax.set_ylabel('Frequency')
plt.xticks(rotation=45, ha='right')
fig.tight_layout()
plt.show()

In [None]:
# The 20 concept strings with the highest occurrence count across all lessons
all_concepts = flatten([lesson.get('concepts', []) for lesson in all_lessons])
concept_freq = Counter(all_concepts)
top_concepts = pd.Series(concept_freq).nlargest(20)

fig, ax = plt.subplots(figsize=(12, 6))
top_concepts.plot.bar(ax=ax)
ax.set_title('Top 20 Most Frequent Concepts')
ax.set_xlabel('Concepts')
ax.set_ylabel('Frequency')
plt.xticks(rotation=45, ha='right')
fig.tight_layout()
plt.show()

In [None]:
# Phrase Types (based on first word)
# A phrase is classified by its lower-cased first whitespace-separated token.
# Empty or whitespace-only phrases are skipped: split() returns [] for them,
# so indexing [0] would raise IndexError.
phrase_types = [
    tokens[0].lower()
    for tokens in (phrase.split() for phrase in all_phrases)
    if tokens
]
plt.figure(figsize=(12, 6))
pd.Series(Counter(phrase_types)).nlargest(10).plot(kind='bar')
plt.title('Top 10 Phrase Types (Based on First Word)')
plt.xlabel('First Word of Phrase')
plt.ylabel('Frequency')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

In [None]:
# Render a word cloud from all word entries joined into one space-separated text
words_text = ' '.join(all_words)
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(words_text)

fig, ax = plt.subplots(figsize=(10, 5))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')  # hide axes around the image
ax.set_title('Word Cloud of All Words')
plt.show()

In [None]:
# Render a word cloud from all concept strings joined into one space-separated text
all_concepts = flatten([lesson.get('concepts', []) for lesson in all_lessons])
concepts_text = ' '.join(all_concepts)
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(concepts_text)

fig, ax = plt.subplots(figsize=(10, 5))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')  # hide axes around the image
ax.set_title('Word Cloud of Concepts')
plt.show()

In [None]:
# Topic similarity throughout the course
# Each topic's 'topic' string is turned into a bag-of-words count vector and
# the vectors are compared pairwise with cosine similarity.
topic_texts = [topic['topic'] for topic in data]
# NOTE: despite its name, `vectorizer` holds the document-term count matrix
# returned by fit_transform(), not the CountVectorizer object itself.
vectorizer = CountVectorizer().fit_transform(topic_texts)
cosine_sim = cosine_similarity(vectorizer)

# Label rows and columns 1..N so the ticks agree with the "Topic Number" axis
# labels; a bare array would be ticked 0..N-1.
topic_numbers = range(1, len(topic_texts) + 1)
similarity_table = pd.DataFrame(cosine_sim, index=topic_numbers, columns=topic_numbers)

plt.figure(figsize=(10, 8))
sns.heatmap(similarity_table, annot=False, cmap='YlGnBu')
plt.title('Topic Similarity Throughout Course')
plt.xlabel('Topic Number')
plt.ylabel('Topic Number')
plt.show()

# Review Items Analysis

In [None]:
# Helper function to get review items
def get_review_items(lesson, item_type):
    """Look up the review list for `item_type` in a lesson mapping.

    The lookup key is `item_type` followed by ``ToReview`` (for example
    ``wordsToReview``). An empty list is returned when the key is absent.
    """
    review_key = '{}ToReview'.format(item_type)
    return lesson.get(review_key, [])

# Build one record per normal lesson holding the number of review words,
# concepts and phrases, plus their sum. Lessons are numbered from 1 in the
# order they appear in normal_lessons.
review_data = []
for lesson_number, lesson in enumerate(normal_lessons, start=1):
    counts = {
        item_type: len(get_review_items(lesson, item_type))
        for item_type in ('words', 'concepts', 'phrases')
    }
    review_data.append({'lesson': lesson_number, **counts, 'total': sum(counts.values())})

# One row per lesson; columns: lesson, words, concepts, phrases, total
df = pd.DataFrame(review_data)

# Number of review items per lesson
# DataFrame.plot() opens a figure of its own unless it is handed an Axes, so
# the figure/axes pair is created explicitly and passed via `ax`. Otherwise
# the 12x6 figure stays blank and the chart lands on a second, default-sized one.
fig, ax = plt.subplots(figsize=(12, 6))
df.plot(x='lesson', y=['words', 'concepts', 'phrases'], kind='bar', stacked=True, ax=ax)
ax.set_title('Number of Review Items per Lesson')
ax.set_xlabel('Lesson Number')
ax.set_ylabel('Number of Review Items')
ax.legend(title='Item Type')
fig.tight_layout()
plt.show()

In [None]:
# Pie chart of how the review items split between words, concepts and phrases
type_totals = df[['words', 'concepts', 'phrases']].sum()

fig, ax = plt.subplots(figsize=(8, 8))
ax.pie(type_totals, labels=['Words', 'Concepts', 'Phrases'], autopct='%1.1f%%')
ax.set_title('Distribution of Review Item Types')
plt.show()

In [None]:
# Heatmap of review items across lessons
# Rows are indexed by the 1-based 'lesson' column so the y tick labels agree
# with the "Lesson Number" axis label; the DataFrame's default index would
# label them 0..N-1.
review_counts = df.set_index('lesson')[['words', 'concepts', 'phrases']]
plt.figure(figsize=(12, 8))
sns.heatmap(review_counts, cmap='YlOrRd', annot=True, fmt='d')
plt.title('Heatmap of Review Items Across Lessons')
plt.xlabel('Item Type')
plt.ylabel('Lesson Number')
plt.show()

In [None]:
# Add a running total of review items to df and plot it against lesson number
df['cumulative_total'] = df['total'].cumsum()

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(df['lesson'], df['cumulative_total'], marker='o')
ax.set_title('Cumulative Review Items Over Lessons')
ax.set_xlabel('Lesson Number')
ax.set_ylabel('Cumulative Number of Review Items')
plt.show()

In [None]:
# Review density across lessons
# Bivariate kernel density estimate of (lesson number, total review items).
# `fill=True` replaces the deprecated `shade=True`, which seaborn has
# superseded with `fill` and warns about on use.
plt.figure(figsize=(12, 6))
sns.kdeplot(data=df, x='lesson', y='total', cmap='YlOrRd', fill=True)
plt.title('Review Density Across Lessons')
plt.xlabel('Lesson Number')
plt.ylabel('Number of Review Items')
plt.show()