Data source: https://www.kaggle.com/code/julqka/gilmore-girls-network-analysis?select=Gilmore_Girls_Lines.csv

In [1]:
import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

In [2]:
import pandas as pd 
import nltk
import warnings
warnings.filterwarnings('ignore')

nltk.download('punkt_tab')  # Download the sentence tokenizer
nltk.download('wordnet')  # Download WordNet, required for semantic analysis for lemmatization
nltk.download('stopwords')

nltk.download('omw-1.4')  # Download the Open Multilingual Wordnet corpus for multilingual semantic analysis.

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/asthapuri/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/asthapuri/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/asthapuri/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/asthapuri/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [3]:
df = pd.read_csv("Gilmore_Girls_Lines.csv")

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,Character,Line,Season
0,0,Lorelai,"Please, Luke. Please, please, please.",1
1,1,Luke,How many cups have you had this morning?,1
2,2,Lorelai,None.,1
3,3,Luke,Plus...,1
4,4,Lorelai,"Five, but yours is better.",1


In [5]:
lorelai = df[df['Character']=='Lorelai']

In [6]:
lorelai.head()

Unnamed: 0.1,Unnamed: 0,Character,Line,Season
0,0,Lorelai,"Please, Luke. Please, please, please.",1
2,2,Lorelai,None.,1
4,4,Lorelai,"Five, but yours is better.",1
6,6,Lorelai,"Yes, I do.",1
8,8,Lorelai,"Angel. You've got wings, baby.",1


In [7]:
lorelai_dialog = ' '.join(lorelai['Line'])

In [8]:
rory = df[df['Character']=='Rory']

In [9]:
rory.head()

Unnamed: 0.1,Unnamed: 0,Character,Line,Season
30,30,Rory,Hey. It's freezing.,1
32,32,Rory,Lip gloss.,1
35,35,Rory,Anything in there not resembling a breakfast c...,1
38,38,Rory,"God, RuPaul doesn't need this much makeup.",1
40,40,Rory,I'm sorry. I lost my Macy Gray CD and I need c...,1


In [10]:
rory_dialog = ' '.join(rory['Line'])

In [11]:
corpus = [lorelai_dialog, rory_dialog]

### Tokenization

In [12]:
#tokenization
from nltk.tokenize import word_tokenize
tokenized_docs = []
for doc in corpus:
  tokens = word_tokenize(doc)
  tokenized_docs.append(tokens)

### Lowercasing

In [13]:
# Convert tokens to lowercase
lowercased_docs = []
for tokens in tokenized_docs:
    lowercased_tokens = [token.lower() for token in tokens]
    lowercased_docs.append(lowercased_tokens)

### Remove punctuation

In [14]:
# Remove punctuation
no_punctuation_docs = []
for tokens in lowercased_docs:
    filtered_tokens = [token for token in tokens if token.isalpha()]  # Remove punctuation
    no_punctuation_docs.append(filtered_tokens)

### Stopwords removal

In [15]:
# Stop Words Removal
from nltk.corpus import stopwords
# Get the English stopwords
stop_words = set(stopwords.words('english'))

# Remove stopwords
no_stopwords_docs = []
for tokens in no_punctuation_docs:
    filtered_tokens = [token for token in tokens if token not in stop_words]
    no_stopwords_docs.append(filtered_tokens)

In [16]:
# Add 'also' to the set of stopwords
stop_words.add('also')

# Remove stopwords again, now including 'also'
no_stopwords_including_also_docs = []
for tokens in no_stopwords_docs:  # We'll start from the already stopword-removed docs
    filtered_tokens = [token for token in tokens if token not in stop_words]
    no_stopwords_including_also_docs.append(filtered_tokens)

### Stemming

In [18]:
from nltk.stem import PorterStemmer

# Initialize the Porter Stemmer
stemmer = PorterStemmer()

# Apply stemming to the documents
stemmed_docs = []
for tokens in no_stopwords_including_also_docs:
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    stemmed_docs.append(stemmed_tokens)

### Advantages-
- Faster and simpler: It's computationally less expensive than other techniques like lemmatization.
- Reduces vocabulary size: By grouping words with similar stems, it helps manage large datasets more efficiently.

### Disadvantages
- The most prevelant one is Oversimplification: Can lead to incorrect stems, like transforming "cars" and "care" into the same stem "car".
- Meaning loss: Removing suffixes can alter the word's meaning, causing confusion.
- Not context-aware: Doesn't consider different grammatical roles a word can play within a sentence.


### Lemmatization

In [27]:
#!python -m spacy download en_core_web_lg 

In [26]:
import spacy

# Load the English language model with lemmatization capabilities

nlp = spacy.load("en_core_web_lg")

# Apply lemmatization using spaCy
spacy_lemmatized_docs = []
for doc in no_stopwords_including_also_docs:
    # Process the document using spaCy
    spacy_doc = nlp(' '.join(doc))
    # Extract lemmatized tokens
    spacy_lemmatized_tokens = [token.lemma_ for token in spacy_doc]
    spacy_lemmatized_docs.append(spacy_lemmatized_tokens)

### Advantages:
- Preserves Meaning: Lemmatization aims to find the dictionary form of a word, resulting in a base form that still retains its meaning. Stemming, on the other hand, simply chops off suffixes without considering context, potentially leading to meaningless stems like "lov" for "loved" or "walk" for "walker."
- More Accurate: Lemmatization utilizes dictionaries and linguistic information to choose the correct base form based on the word's part-of-speech and context. Stemming is rule-based and can produce ambiguous results, especially for words with multiple meanings or non-standard forms.
- Improves downstream tasks: The accuracy of lemmatization leads to better performance in tasks like information retrieval, text classification, and sentiment analysis because words with similar meanings are grouped correctly. Stemming's less accurate results can negatively impact these tasks.

### Disadvantages:
- Slower and computationally expensive: Lemmatization requires accessing dictionaries and performing more complex analysis, making it slower than stemming, which uses simple rules.
- Not always perfect: Even lemmatization can make mistakes in complex cases or due to limitations in dictionaries and linguistic models.

### Vocabulary Building

In [28]:
# Initialize an empty set to store the vocabulary
vocabulary = set()

# Iterate over each document and add each unique word to the vocabulary set
for doc in spacy_lemmatized_docs:
    for token in doc:
        vocabulary.add(token)

# Convert the set to a sorted list if you want the vocabulary to be ordered
vocabulary = sorted(list(vocabulary))

### Vectorization

In [31]:
# import necessary library
from sklearn.feature_extraction.text import CountVectorizer

In [32]:
# Convert the lemmatized documents back into string format for vectorization
documents = [" ".join(doc) for doc in spacy_lemmatized_docs]

### One hot encoding

In [33]:
vectorizer_ohe = CountVectorizer(vocabulary=vocabulary, binary=True)

In [34]:
# Fit and transform the documents
X = vectorizer_ohe.fit_transform(documents)
 
# Get the feature names (vocabulary)
feature_names = vectorizer_ohe.get_feature_names_out()

# Display the feature names and the shape of the document-term matrix
print("Feature Names:", feature_names)
print("Document-Term Matrix Shape:", X.shape)

# If you want to see the frequency count of each word in the document
frequency_counts = X.toarray()
print("Frequency Counts:\n", frequency_counts)

Feature Names: ['aa' 'aaaah' 'aaagh' ... 'zurich' 'zydeco' 'zzz']
Document-Term Matrix Shape: (2, 13298)
Frequency Counts:
 [[0 1 1 ... 0 1 1]
 [1 0 0 ... 1 1 0]]


In [35]:
pd.DataFrame(X.toarray(), columns=[feature_names])

Unnamed: 0,aa,aaaah,aaagh,aaah,aaaww,aah,aahh,aalgonquin,aaron,ab,...,zombie,zone,zoo,zoom,zsa,zucchini,zucker,zurich,zydeco,zzz
0,0,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,0,1,1
1,1,0,0,0,0,1,0,0,0,0,...,0,1,1,0,1,0,0,1,1,0


### Count Vectorization

In [36]:
# Initialize CountVectorizer
count_vectorizer = CountVectorizer(vocabulary=vocabulary)

In [37]:
# Fit and transform the documents
X = count_vectorizer.fit_transform(documents)
 
# Get the feature names (vocabulary)
feature_names = count_vectorizer.get_feature_names_out()

# Display the feature names and the shape of the document-term matrix
print("Feature Names:", feature_names)
print("Document-Term Matrix Shape:", X.shape)

# If you want to see the frequency count of each word in the document
frequency_counts = X.toarray()
print("Frequency Counts:\n", frequency_counts)

Feature Names: ['aa' 'aaaah' 'aaagh' ... 'zurich' 'zydeco' 'zzz']
Document-Term Matrix Shape: (2, 13298)
Frequency Counts:
 [[0 1 1 ... 0 1 1]
 [1 0 0 ... 2 3 0]]


In [38]:
pd.DataFrame(X.toarray(), columns=[feature_names])

Unnamed: 0,aa,aaaah,aaagh,aaah,aaaww,aah,aahh,aalgonquin,aaron,ab,...,zombie,zone,zoo,zoom,zsa,zucchini,zucker,zurich,zydeco,zzz
0,0,1,1,1,1,10,1,1,1,2,...,1,5,2,1,2,13,1,0,1,1
1,1,0,0,0,0,5,0,0,0,0,...,0,6,2,0,2,0,0,2,3,0
