In [None]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
import nltk
import spacy
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
nltk.download("punkt")
nltk.download("stopwords")
from spacy import displacy
from PIL import Image
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import matplotlib.pyplot as plt
import tensorflow as tf
# Report whether the installed TensorFlow build was compiled with CUDA (GPU)
# support. This inspects a build flag only: it does not initialize TensorFlow
# and does not check that a GPU is actually present.
if tf.test.is_built_with_cuda():
    print("TensorFlow is built with CUDA support")
import plotly.express as px
import plotly.graph_objects as go
from wordcloud import WordCloud

Loading the data

In [None]:
path = "/kaggle/input/friends-tv-show-script/Friends_Transcript.txt"
# Read the whole transcript into memory. The context manager closes the file
# handle as soon as the read finishes instead of leaving it open until the
# object happens to be garbage-collected.
with open(path, 'r') as transcript_file:
    text = transcript_file.read()


Sample of the transcript

In [None]:
# Show the first 1,000 characters of the transcript as the cell output.
text[:1000]

What are the most frequently used words or phrases?

In [None]:
# Small English spaCy pipeline.
nlp = spacy.load("en_core_web_sm")
# NLTK's English stopword list; used by the text-cleaning helper and the
# word cloud further down.
stopword = stopwords.words('english')

def textCleaning(text):
    """Remove punctuation and English stopwords from ``text``.

    Parameters
    ----------
    text : object
        Text to clean; it is converted with ``str()`` first.

    Returns
    -------
    str
        The surviving words, in their original case and order, joined by
        single spaces.

    Notes
    -----
    Relies on the module-level ``stopword`` list. Stopwords are matched
    case-insensitively so that capitalised forms such as "I" or "The" are
    removed as well as their lowercase spellings.
    """
    # Build a lower-cased set once per call: O(1) lookups instead of scanning
    # the stopword list for every word of the transcript.
    stopword_set = {word.lower() for word in stopword}
    # Drop every character that is neither a word character nor whitespace.
    text = re.sub(r'[^\w\s]', '', str(text))
    # Raw string: "\W" in a normal string literal is an invalid escape sequence.
    tokens = re.split(r"\W+", text)
    # `token` is falsy for the empty strings re.split yields at the edges.
    kept = [token for token in tokens
            if token and token.lower() not in stopword_set]
    return ' '.join(kept)


def wordFrequency(text, top_n=2500):
    """Count how often each cleaned word occurs in ``text``.

    Parameters
    ----------
    text : object
        Raw text; it is passed through ``textCleaning`` before counting.
    top_n : int, default 2500
        Maximum number of distinct words to keep, most frequent first.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``Words`` and ``Count``, ordered by descending count,
        with at most ``top_n`` rows.
    """
    cleanText = textCleaning(text)
    word_table = pd.DataFrame(cleanText.split(), columns=["Words"])
    # value_counts() is already sorted by descending frequency, so a single
    # positional slice is enough to keep the most common words.
    top_counts = word_table.value_counts().iloc[:top_n].reset_index(drop=False)
    top_counts.columns = ["Words", "Count"]
    return top_counts

In [None]:
# Build the word-frequency table for the full transcript, then display the ten
# most common words shaded with a blue gradient.
frequentWords = wordFrequency(text)
frequentWords.head(10).style.background_gradient(cmap="Blues")

In [None]:
# Funnel chart of the ten most frequent words (count on x, word on y).
top_ten_words = frequentWords.head(10)
fig = px.funnel(top_ten_words, x="Count", y="Words")
fig.show()

In [None]:
# Collect the frequent words as plain strings, in table order.
words = [str(word) for word in frequentWords.Words]

In [None]:
# Render a word cloud from the space-joined list of frequent words.
cloud_source = ' '.join(words)
wordcloud = WordCloud(
    width=250,
    height=250,
    background_color="black",
    stopwords=stopword,
    min_font_size=10,
).generate(cloud_source)

# Show the cloud on a square figure with the axes hidden and no padding.
plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()

How do different characters contribute to the dialogue?

Analyze how dialogue is distributed among the main characters to see who speaks the most and what role each plays in the series.

In [None]:
# The six characters to tally.
name_list = ['Joey','Monica','Phoebe','Chandler','Ross','Rachel']
# Whitespace-separated tokens of the whole transcript.
split_string = text.split()
# (name, count) pairs, where count is the number of tokens exactly equal to
# the name.
# NOTE(review): exact matching means tokens such as "Joey:" or "Joey," are not
# counted — confirm this matches how the transcript labels its speakers.
scripts = [(name, split_string.count(name)) for name in name_list]

In [None]:
# One slice colour per character, in the same order as name_list.
colors = ['#2F86A6','#34BE82','#2FDD92','#F2F013','#F9975D','#F4E185']
# Take the count out of each of the six (name, count) pairs.
sections = [scripts[position][1] for position in range(6)]

# Pie chart of each character's share of the counted tokens.
plt.figure(figsize=(14, 8), dpi=75)
label_style = {'fontsize': 15, 'fontweight': 'normal'}
plt.pie(
    sections,
    labels=name_list,
    colors=colors,
    wedgeprops=dict(alpha=1),
    startangle=90,
    autopct='%0.1f%%',
    textprops=label_style,
)

# Equal aspect ratio keeps the pie circular.
plt.axis('equal')
plt.title('Script Count', fontsize=20)
plt.show()

What are the key topics or themes discussed in the series?

In [None]:
def preprocess_text(text):
    """Normalise ``text`` for topic modelling.

    Lower-cases the text, removes every character that is not a lowercase
    letter or whitespace, tokenizes with NLTK's ``word_tokenize`` and drops
    NLTK's English stopwords.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation and numbers
    text = re.sub(r'[^a-z\s]', '', text)
    # Tokenize
    tokens = word_tokenize(text)
    # Fetch the stopword list once and keep it in a set. Calling
    # stopwords.words('english') inside the comprehension repeated that call,
    # and a linear scan of its result, for every single token.
    english_stopwords = set(stopwords.words('english'))
    # Remove stopwords
    tokens = [word for word in tokens if word not in english_stopwords]
    # Join tokens back into string
    return ' '.join(tokens)

# Preprocess the transcript
cleaned_transcript = preprocess_text(text)

In [None]:
# Create TF-IDF vectorizer
vectorizer = TfidfVectorizer(stop_words='english')
# TF-IDF weighting and LDA both learn from how word usage varies *across*
# documents. Passing the whole transcript as one document gives every term the
# same IDF and gives LDA a single sample, so the topics cannot separate.
# The cleaned transcript is therefore cut into consecutive fixed-size chunks
# that act as the documents of the corpus.
WORDS_PER_DOCUMENT = 500  # chunk size in cleaned tokens; tune as needed
transcript_tokens = cleaned_transcript.split()
documents = [
    ' '.join(transcript_tokens[start:start + WORDS_PER_DOCUMENT])
    for start in range(0, len(transcript_tokens), WORDS_PER_DOCUMENT)
]
# Rows of X are chunks, columns are vocabulary terms.
X = vectorizer.fit_transform(documents)

In [None]:
# Perform topic modeling
# Number of topics the LDA model is asked to find.
num_topics = 5
# random_state is fixed so that repeated runs are reproducible.
lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)
# Fit on the TF-IDF matrix built in the previous cell.
lda.fit(X)

In [None]:
def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted words of every topic in ``model``.

    For each row of ``model.components_`` a ``Topic #<index>:`` header is
    printed, followed by a line with that row's ``no_top_words`` largest
    entries mapped through ``feature_names`` (largest first). A single blank
    line is printed after the last topic.

    Parameters
    ----------
    model : object
        Fitted model exposing a ``components_`` array, one row per topic.
    feature_names : sequence of str
        Vocabulary term for each column of ``components_``.
    no_top_words : int
        Number of words to print per topic.
    """
    for number, weights in enumerate(model.components_):
        # argsort is ascending: reverse it and keep the leading positions.
        strongest = weights.argsort()[::-1][:no_top_words]
        print(f"Topic #{number}:")
        print(" ".join(feature_names[position] for position in strongest))
    print()

# Vocabulary terms of the fitted vectorizer, used to turn topic column indices
# back into words.
tfidf_feature_names = vectorizer.get_feature_names_out()
# Print the ten highest-weighted words of each topic.
display_topics(lda, tfidf_feature_names, no_top_words=10)

In [None]:
pip install vaderSentiment

In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# One shared analyzer instance, used by analyze_sentiment below.
analyzer = SentimentIntensityAnalyzer()
def preprocess_and_split(text):
    """Split ``text`` into segments separated by blank lines.

    Parameters
    ----------
    text : str
        Text to split.

    Returns
    -------
    list of str
        The pieces between runs of two or more newlines, in order. Pieces
        that are empty or contain only whitespace are omitted, so an empty
        input yields an empty list.
    """
    # Split by multiple newlines
    pieces = re.split(r'\n{2,}', text)
    # Blank lines at the very start or end of the text (and whitespace-only
    # pieces) carry nothing to score; dropping them keeps empty segments out
    # of the sentiment results.
    return [piece for piece in pieces if piece.strip()]

# Break the raw transcript into blank-line-delimited segments.
segments = preprocess_and_split(text)

def analyze_sentiment(text):
    """Return the shared analyzer's polarity scores for ``text``."""
    scores = analyzer.polarity_scores(text)
    return scores

# Score every segment, keeping the results in segment order.
sentiments = list(map(analyze_sentiment, segments))