In [1]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from collections import Counter
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

# Download stopwords
nltk.download('stopwords')

# Load dataset
file_path = "IPL_Match_Highlights_Commentary.csv"
df = pd.read_csv(file_path)

# Extract Commentary column
df = df[['Commentary']].dropna()

# Preprocess text function
def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = text.translate(str.maketrans("", "", string.punctuation))  # Remove punctuation
    words = text.split()  # Tokenization
    words = [word for word in words if word not in stopwords.words('english')]  # Remove stopwords
    return words

# Apply preprocessing
df['Cleaned_Commentary'] = df['Commentary'].apply(preprocess_text)

# Get unique words
all_words = set(word for words in df['Cleaned_Commentary'] for word in words)

# Compute Term Frequency (TF)
def compute_tf(text):
    word_counts = Counter(text)
    total_words = len(text)
    return {word: word_counts[word] / total_words for word in word_counts}

df['TF'] = df['Cleaned_Commentary'].apply(compute_tf)

# Compute Document Frequency (DF)
df_list = df['Cleaned_Commentary'].tolist()
doc_count = len(df_list)
df_counts = {word: sum(1 for text in df_list if word in text) for word in all_words}

# Compute Inverse Document Frequency (IDF)
idf_values = {word: np.log(doc_count / (df_counts[word] + 1)) for word in all_words}

# Compute TF-IDF
def compute_tfidf(tf_dict):
    return {word: tf_dict[word] * idf_values[word] for word in tf_dict}

df['TF-IDF'] = df['TF'].apply(compute_tfidf)

# Convert TF-IDF dictionary to DataFrame
tfidf_df = pd.DataFrame(df['TF-IDF'].tolist()).fillna(0)

# -------------------- Sklearn TF-IDF for Comparison --------------------
vectorizer = TfidfVectorizer()
sklearn_tfidf = vectorizer.fit_transform(df['Commentary'].astype(str))
sklearn_df = pd.DataFrame(sklearn_tfidf.toarray(), columns=vectorizer.get_feature_names_out())

# Display first 5 rows of both TF-IDF implementations
print("\n🔹 TF-IDF from Scratch (First 5 rows):")
print(tfidf_df.head())

print("\n🔹 Sklearn TF-IDF (First 5 rows):")
print(sklearn_df.head())


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\daksh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!



🔹 TF-IDF from Scratch (First 5 rows):
      nehra   mandeep      four     first  boundary       rcb      full  \
0  0.283486  0.697085  0.038342  0.151717  0.125469  0.189348  0.099640   
1  0.384730  0.315348  0.026018  0.102951  0.000000  0.000000  0.000000   
2  0.000000  0.000000  0.031674  0.000000  0.000000  0.000000  0.000000   
3  0.489657  0.000000  0.033114  0.000000  0.000000  0.000000  0.086052   
4  0.347498  0.000000  0.047000  0.000000  0.000000  0.000000  0.000000   

       pads   needed       put  ...  uhoh  latent  wrath  microsix  whodve  \
0  0.176715  0.20101  0.203775  ...   0.0     0.0    0.0       0.0     0.0   
1  0.000000  0.00000  0.000000  ...   0.0     0.0    0.0       0.0     0.0   
2  0.000000  0.00000  0.000000  ...   0.0     0.0    0.0       0.0     0.0   
3  0.000000  0.00000  0.000000  ...   0.0     0.0    0.0       0.0     0.0   
4  0.000000  0.00000  0.000000  ...   0.0     0.0    0.0       0.0     0.0   

   outunorthodox  paddlepulls  expresspac

In [12]:
# Manually defining a list of English stopwords
custom_stopwords = [
    "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours",
    "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself",
    "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this",
    "that", "these", "those", "am", "is", "are", "was", "were", "be", "been", "being",
    "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", "the",
    "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for",
    "with", "about", "against", "between", "into", "through", "during", "before", "after",
    "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under"
]

# Example usage to remove stopwords from text
def remove_stopwords(text):
    words = text.split()
    filtered_words = [word for word in words if word.lower() not in custom_stopwords]
    return " ".join(filtered_words)

# Test example
text = "This is an example sentence to test stopwords removal."
filtered_text = remove_stopwords(text)
print("Original:", text)
print("Filtered:", filtered_text)


Original: This is an example sentence to test stopwords removal.
Filtered: example sentence test stopwords removal.


In [13]:
import pandas as pd
import numpy as np
import re
import string
from collections import Counter
import math
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

# Download stopwords if not available
nltk.download('stopwords')
nltk.download('punkt')

# Load the CSV file
file_path = "IPL_Match_Highlights_Commentary.csv"  # Update the path if needed
df = pd.read_csv(file_path)

# Select only the 'Commentary' column and drop NaN values
df = df[['Commentary']].dropna()

# Convert to a list of comments
documents = df['Commentary'].tolist()

# Preprocessing Function
def preprocess(text):
    text = text.lower()  # Lowercasing
    text = re.sub(f"[{string.punctuation}]", "", text)  # Remove punctuation
    words = word_tokenize(text)  # Tokenize words
    words = [word for word in words if word not in stopwords.words('english')]  # Remove stopwords
    return words

# Apply preprocessing to each document
preprocessed_docs = [preprocess(doc) for doc in documents]

# Create vocabulary
vocabulary = set(word for doc in preprocessed_docs for word in doc)

# Compute Term Frequency (TF)
def compute_tf(doc):
    word_counts = Counter(doc)
    total_words = len(doc)
    return {word: word_counts[word] / total_words for word in word_counts}

tf_values = [compute_tf(doc) for doc in preprocessed_docs]

# Compute Inverse Document Frequency (IDF)
def compute_idf(docs, vocab):
    num_docs = len(docs)
    idf_values = {}
    for word in vocab:
        containing_docs = sum(1 for doc in docs if word in doc)
        idf_values[word] = math.log(num_docs / (1 + containing_docs))  # Using log smoothing
    return idf_values

idf_values = compute_idf(preprocessed_docs, vocabulary)

# Compute TF-IDF for each document
tfidf_values = [{word: tf[word] * idf_values[word] for word in tf} for tf in tf_values]

# Convert TF-IDF to DataFrame
tfidf_df = pd.DataFrame(tfidf_values).fillna(0)
print("TF-IDF from Scratch:")
print(tfidf_df.head())

# Save TF-IDF results to CSV
tfidf_df.to_csv("tfidf_from_scratch.csv", index=False)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\daksh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\daksh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - 'C:\\Users\\daksh/nltk_data'
    - 'C:\\ProgramData\\anaconda3\\nltk_data'
    - 'C:\\ProgramData\\anaconda3\\share\\nltk_data'
    - 'C:\\ProgramData\\anaconda3\\lib\\nltk_data'
    - 'C:\\Users\\daksh\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************


In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Convert raw text documents to TF-IDF matrix using Scikit-learn
vectorizer = TfidfVectorizer(stop_words='english', lowercase=True)
tfidf_matrix = vectorizer.fit_transform(df['Commentary'].dropna())

# Convert to DataFrame
tfidf_sklearn_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())

print("TF-IDF using Scikit-learn:")
print(tfidf_sklearn_df.head())

# Save TF-IDF results from Scikit-learn to CSV
tfidf_sklearn_df.to_csv("tfidf_sklearn.csv", index=False)


TF-IDF using Scikit-learn:
   000   07   10  100  1000  100kph  100ks  100th  101  101kph  ...  zipping  \
0  0.0  0.0  0.0  0.0   0.0     0.0    0.0    0.0  0.0     0.0  ...      0.0   
1  0.0  0.0  0.0  0.0   0.0     0.0    0.0    0.0  0.0     0.0  ...      0.0   
2  0.0  0.0  0.0  0.0   0.0     0.0    0.0    0.0  0.0     0.0  ...      0.0   
3  0.0  0.0  0.0  0.0   0.0     0.0    0.0    0.0  0.0     0.0  ...      0.0   
4  0.0  0.0  0.0  0.0   0.0     0.0    0.0    0.0  0.0     0.0  ...      0.0   

   zips  zone  zones  zoning  zoomed  zoomer  zooming  zooms  zoots  
0   0.0   0.0    0.0     0.0     0.0     0.0      0.0    0.0    0.0  
1   0.0   0.0    0.0     0.0     0.0     0.0      0.0    0.0    0.0  
2   0.0   0.0    0.0     0.0     0.0     0.0      0.0    0.0    0.0  
3   0.0   0.0    0.0     0.0     0.0     0.0      0.0    0.0    0.0  
4   0.0   0.0    0.0     0.0     0.0     0.0      0.0    0.0    0.0  

[5 rows x 9147 columns]


In [15]:
import nltk
nltk.download('punkt')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\daksh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [16]:
import nltk
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\daksh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [23]:
import pandas as pd

# Load the dataset
df = pd.read_csv('IPL_Match_Highlights_Commentary.csv')

# Display column names
print(df.columns)


Index(['Match_id', 'Team', 'Over_num', 'Commentary', 'batsman', 'score'], dtype='object')


In [24]:
import pandas as pd
import numpy as np
import math
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the dataset
df = pd.read_csv('IPL_Match_Highlights_Commentary.csv')

# Display column names to identify the correct column
print("Columns in dataset:", df.columns)

# Identify the correct text column
txt_column = None
for col in df.columns:
    if df[col].dtype == 'object':  # Assuming commentary is text-based
        txt_column = col
        break

if txt_column is None:
    raise ValueError("No suitable text column found in dataset")

# Extract text data
documents = df[txt_column].dropna().tolist()

# Tokenization function
def tokenize(text):
    return text.lower().split()

# Compute TF (Term Frequency)
def compute_tf(doc):
    word_counts = Counter(tokenize(doc))
    total_words = len(tokenize(doc))
    return {word: count / total_words for word, count in word_counts.items()}

# Compute IDF (Inverse Document Frequency)
def compute_idf(docs):
    N = len(docs)
    idf_values = {}
    all_words = set(word for doc in docs for word in tokenize(doc))
    
    for word in all_words:
        df_t = sum(1 for doc in docs if word in tokenize(doc))
        idf_values[word] = math.log((N + 1) / (df_t + 1)) + 1  # Smoothed IDF
    
    return idf_values

# Compute TF-IDF
def compute_tfidf(docs):
    idf_values = compute_idf(docs)
    tfidf_docs = []
    
    for doc in docs:
        tf = compute_tf(doc)
        tfidf = {word: tf[word] * idf_values[word] for word in tf}
        tfidf_docs.append(tfidf)
    
    return tfidf_docs

# Compute TF-IDF manually
tfidf_manual = compute_tfidf(documents)

# Compare with Scikit-learn's TfidfVectorizer
vectorizer = TfidfVectorizer()
tfidf_sklearn = vectorizer.fit_transform(documents)
feature_names = vectorizer.get_feature_names_out()

# Convert to DataFrame for easy comparison
tfidf_sklearn_df = pd.DataFrame(tfidf_sklearn.toarray(), columns=feature_names)

# Display results
print("TF-IDF from scratch (first document):", tfidf_manual[0])
print("\nTF-IDF using Scikit-learn (first document):\n", tfidf_sklearn_df.iloc[0])


Columns in dataset: Index(['Match_id', 'Team', 'Over_num', 'Commentary', 'batsman', 'score'], dtype='object')
TF-IDF from scratch (first document): {'rcb': 1.0673291740135906, '1st': 0.3333333333333333, 'inns': 0.3333333333333333}

TF-IDF using Scikit-learn (first document):
 1st     0.285683
csk     0.000000
dc      0.000000
gl      0.000000
inns    0.285683
kkr     0.000000
kxip    0.000000
mi      0.000000
rcb     0.914752
rps     0.000000
rr      0.000000
srh     0.000000
Name: 0, dtype: float64


In [26]:
import pandas as pd
import numpy as np
import math
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
print("Loading dataset...")
df = pd.read_csv('IPL_Match_Highlights_Commentary.csv')
print("Identifying the text column...")
print("Columns in dataset:", df.columns)
txt_column = next((col for col in df.columns if df[col].dtype == 'object'), None)
if txt_column is None:
    raise ValueError("No suitable text column found in dataset")
print(f"Using '{txt_column}' as the text column.")
documents = df[txt_column].dropna().tolist()
def tokenize(text):
    return text.lower().split()
def compute_tf(doc):
    words = tokenize(doc)
    word_counts = Counter(words)
    total_words = len(words)
    return {word: count / total_words for word, count in word_counts.items()}
def compute_idf(docs):
    N = len(docs)
    idf_values = {}
    unique_words = set(word for doc in docs for word in tokenize(doc))
    
    for word in unique_words:
        df_t = sum(1 for doc in docs if word in tokenize(doc))
        idf_values[word] = math.log((N + 1) / (df_t + 1)) + 1  # Smoothed IDF
    
    return idf_values
def compute_tfidf(docs):
    print("Computing TF-IDF manually...")
    idf_values = compute_idf(docs)
    tfidf_docs = []
    
    for doc in docs:
        tf = compute_tf(doc)
        tfidf = {word: tf[word] * idf_values[word] for word in tf}
        tfidf_docs.append(tfidf)
    
    return tfidf_docs
tfidf_manual = compute_tfidf(documents)
print("Computing TF-IDF using Scikit-learn...")
vectorizer = TfidfVectorizer()
tfidf_sklearn = vectorizer.fit_transform(documents)
feature_names = vectorizer.get_feature_names_out()
tfidf_sklearn_df = pd.DataFrame(tfidf_sklearn.toarray(), columns=feature_names)
print("TF-IDF from scratch (first document):", tfidf_manual[0])
print("\nTF-IDF using Scikit-learn (first document):\n", tfidf_sklearn_df.iloc[0])


Loading dataset...
Identifying the text column...
Columns in dataset: Index(['Match_id', 'Team', 'Over_num', 'Commentary', 'batsman', 'score'], dtype='object')
Using 'Team' as the text column.
Computing TF-IDF manually...
Computing TF-IDF using Scikit-learn...
TF-IDF from scratch (first document): {'rcb': 1.0673291740135906, '1st': 0.3333333333333333, 'inns': 0.3333333333333333}

TF-IDF using Scikit-learn (first document):
 1st     0.285683
csk     0.000000
dc      0.000000
gl      0.000000
inns    0.285683
kkr     0.000000
kxip    0.000000
mi      0.000000
rcb     0.914752
rps     0.000000
rr      0.000000
srh     0.000000
Name: 0, dtype: float64
