In [None]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import Counter
import nltk

# Download necessary NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Load CSV
df = pd.read_csv('/content/matched_all_answers_with_nifty_and_recommendation - Copy.csv')  # Replace with your actual file name

# Combine all summaries into one string
text = ' '.join(df['summary_of_all_answers'].dropna().astype(str))

# Basic cleaning
text = text.lower()
# Replace (rather than delete) punctuation and digits with a space. Deleting
# them fuses hyphen/slash-joined terms into a single bogus token, e.g.
# "year-over-year" -> "yearoveryear".
text = re.sub(r'[^a-z\s]', ' ', text)

# Tokenize
words = text.split()

# Remove stopwords, plus one-letter fragments left behind by the punctuation
# replacement (e.g. the "e" and "g" of "e.g.")
stop_words = set(stopwords.words('english'))
words = [word for word in words if len(word) > 1 and word not in stop_words]

# Lemmatization only
lemmatizer = WordNetLemmatizer()
lemmatized_words = [lemmatizer.lemmatize(word) for word in words]

# Word frequency
word_freq = Counter(lemmatized_words)
most_common_words = word_freq.most_common(20)

# Display
print("Most Common Lemmatized Words:\n")
for word, freq in most_common_words:
    print(f"{word}: {freq}")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Most Common Lemmatized Words:

company: 937
issue: 395
share: 328
ipo: 308
price: 277
growth: 259
equity: 255
financial: 223
revenue: 212
market: 189
regulatory: 187
valuation: 178
competitive: 163
industry: 162
net: 160
fy: 158
key: 156
rate: 145
risk: 139
point: 137


In [None]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk

# Download resources
for resource in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource)

# Load data
df = pd.read_csv('/content/matched_all_answers_with_nifty_and_recommendation - Copy.csv')  # Replace with your actual file name

# Shared helpers used by the preprocessing function (lemmatization only)
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess(text):
    """Normalize one summary into a space-joined string of lemmatized tokens.

    The input is coerced to ``str`` and lower-cased, every character that is
    not a lowercase ASCII letter or whitespace is replaced by a space, tokens
    found in the module-level ``stop_words`` set are dropped, and the
    remaining tokens are lemmatized with the module-level ``lemmatizer``.

    Args:
        text: Raw summary; any object accepted by ``str()``.

    Returns:
        The cleaned tokens joined by single spaces (``''`` if none survive).
    """
    text = str(text).lower()
    # Substitute a space instead of deleting the character: deletion glues
    # hyphenated terms together ("year-over-year" -> "yearoveryear").
    text = re.sub(r'[^a-z\s]', ' ', text)
    words = [word for word in text.split() if word not in stop_words]
    return ' '.join(lemmatizer.lemmatize(word) for word in words)

# Clean every summary; missing summaries are treated as empty documents
summaries = df['summary_of_all_answers'].fillna('')
df['clean_summary'] = summaries.apply(preprocess)

# Fit TF-IDF on the cleaned text (each row is one document)
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df['clean_summary'])

# Dense view of the matrix with one column per vocabulary term (optional)
vocabulary = vectorizer.get_feature_names_out()
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vocabulary)

# Preview
print(tfidf_df.head())


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


    ability  able  absence  absorbing  accelerated  acceptance  accepting  \
0  0.107244   0.0      0.0        0.0          0.0         0.0        0.0   
1  0.000000   0.0      0.0        0.0          0.0         0.0        0.0   
2  0.000000   0.0      0.0        0.0          0.0         0.0        0.0   
3  0.000000   0.0      0.0        0.0          0.0         0.0        0.0   
4  0.000000   0.0      0.0        0.0          0.0         0.0        0.0   

   access  accident  account  ...  withdrawing  within  woman  work  working  \
0     0.0       0.0      0.0  ...          0.0     0.0    0.0   0.0      0.0   
1     0.0       0.0      0.0  ...          0.0     0.0    0.0   0.0      0.0   
2     0.0       0.0      0.0  ...          0.0     0.0    0.0   0.0      0.0   
3     0.0       0.0      0.0  ...          0.0     0.0    0.0   0.0      0.0   
4     0.0       0.0      0.0  ...          0.0     0.0    0.0   0.0      0.0   

   worth     yarn      year  yearoveryear  yoy  
0    0.

In [None]:
# Get feature names (i.e., the vocabulary terms) from the fitted `vectorizer`.
# The same array is used as the column labels of the dense TF-IDF frame, so
# entry k is the term for column k of the TF-IDF matrix.
feature_names = vectorizer.get_feature_names_out()

# Function to get top n TF-IDF terms from a row
def get_top_n_tfidf_words(row, features, top_n=10):
    """Return the highest-scoring ``(term, score)`` pairs of one TF-IDF row.

    Args:
        row: A single-row matrix exposing ``toarray()``.
        features: Sequence mapping a column position to its term.
        top_n: Maximum number of pairs to return.

    Returns:
        A list of ``(term, score)`` tuples in descending score order, with
        scores rounded to 4 decimals. Entries whose score is not positive
        are omitted, so the list may hold fewer than ``top_n`` items.
    """
    scores = row.toarray().flatten()
    # Column positions by descending score, cut to the requested count
    ranked_positions = scores.argsort()[::-1][:top_n]

    top_terms = []
    for position in ranked_positions:
        weight = scores[position]
        if weight > 0:
            top_terms.append((features[position], round(weight, 4)))
    return top_terms

# Collect the top-10 TF-IDF terms of every document, in row order
top_terms_per_doc = []
for doc_pos in range(tfidf_matrix.shape[0]):
    doc_vector = tfidf_matrix[doc_pos]
    top_terms_per_doc.append(get_top_n_tfidf_words(doc_vector, feature_names, top_n=10))
df['top_10_tfidf'] = top_terms_per_doc

# View results
print(df[['top_10_tfidf']])


                                          top_10_tfidf
0    [(order, 0.4943), (book, 0.3858), (infrastruct...
1    [(company, 0.4032), (packaging, 0.2938), (pape...
2    [(new, 0.2567), (expanding, 0.2437), (expansio...
3    [(timbadia, 0.4042), (kartik, 0.2695), (maganl...
4    [(repayment, 0.2595), (lead, 0.2373), (dividen...
..                                                 ...
142  [(information, 0.229), (dividend, 0.1875), (tr...
143  [(company, 0.2714), (high, 0.2542), (limit, 0....
144  [(subject, 0.2196), (company, 0.218), (may, 0....
145  [(cagr, 0.3253), (fiscal, 0.3196), (compliance...
146  [(stated, 0.2158), (year, 0.2132), (company, 0...

[147 rows x 1 columns]


In [None]:
# Display top 10 TF-IDF words for first few rows.
# Walk the first index labels instead of range(5): with df.loc, range(5)
# raises KeyError when the frame has fewer than 5 rows or its index is not
# the default 0..n-1 range.
for i in df.index[:5]:  # change the slice for more/less rows
    print(f"\nRow {i} Summary:\n{df.loc[i]}")
    print("Top 10 TF-IDF Words:")
    for word, score in df.loc[i, 'top_10_tfidf']:
        print(f"  {word:<15} {score}")


Row 0 Summary:
Unnamed: 0                                                                 99
Name                                                   Afcons Infrastructure 
Review Title                                                    Afcons Infra 
Match Score                                                         72.727273
Listing Date                                                       04-11-2024
Issue Price                                                               463
Listing Price                                                          430.05
LTP                                                                     450.7
Returns from issue price                                                -2.66
Delta                                                               -7.116631
Target                                                                     No
answer_of_question_1                                                      NaN
answer_of_question_3        The company will uti

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk

# Download VADER lexicon
nltk.download('vader_lexicon')

# Initialize sentiment analyzer
sid = SentimentIntensityAnalyzer()


def _compound_score(summary):
    """Return the 'compound' entry of the analyzer's scores for one summary."""
    return sid.polarity_scores(summary)['compound']


# Score each original (uncleaned) summary; missing ones are scored as ''
raw_summaries = df['summary_of_all_answers'].fillna('')
df['sentiment'] = raw_summaries.apply(_compound_score)

# Preview
print(df[['summary_of_all_answers', 'sentiment']])

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


                                summary_of_all_answers  sentiment
0    **Key Points to Consider Before Subscribing to...     0.9438
1    Here is a summary of the key points that inves...     0.9868
2    **IPO Analysis Summary**\n\n**Key Points:**\n\...     0.6236
3    **IPO Analysis Summary**\n\n**Key Points for I...     0.4939
4    **IPO Analysis Summary**\n\n**Key Points for I...     0.6542
..                                                 ...        ...
142  **Key Points for Investors to Consider**\n\n**...     0.7351
143  **Key Points for Investors to Consider:**\n\n1...     0.9712
144  **Key Points to Consider Before Subscribing to...     0.5621
145  **IPO Analysis Summary**\n\n**Key Points:**\n\...     0.9231
146  **IPO Analysis Summary**\n\n**Key Points:**\n\...     0.8953

[147 rows x 2 columns]


In [None]:
import pandas as pd
import numpy as np
import re
import nltk
import os
import requests
import zipfile
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Download necessary NLTK resources.
# Everything this cell needs is fetched here so it runs on a fresh kernel
# without depending on other cells having been executed first.
nltk.download('punkt')
nltk.download('punkt_tab')  # tokenizer tables; NOTE(review): needed by word_tokenize on recent NLTK releases
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')  # companion WordNet data, as downloaded in the earlier cells
nltk.download('vader_lexicon')

# Function to download and load GloVe embeddings
def load_glove_embeddings(dimension=50):
    """Load GloVe 6B word vectors, downloading the archive on first use.

    Looks for ``./glove/glove.6B.{dimension}d.txt``. When the file is missing,
    the ``glove.6B.zip`` archive is downloaded into ``./glove``, extracted
    there, and the zip is deleted afterwards (also when the download or the
    extraction fails, so no partial archive is left behind).

    Args:
        dimension: Vector size; selects which ``glove.6B.*d.txt`` file is read.

    Returns:
        dict mapping each word to a 1-D ``float32`` numpy array of length
        ``dimension``.

    Raises:
        requests.HTTPError: The download returned an error status.
        FileNotFoundError: The archive did not contain the requested file.
        ValueError: A non-blank line does not hold ``dimension`` values.
    """
    # File paths
    glove_dir = './glove'
    glove_file = f'glove.6B.{dimension}d.txt'
    glove_path = os.path.join(glove_dir, glove_file)

    # Create directory if it doesn't exist
    os.makedirs(glove_dir, exist_ok=True)

    # Download GloVe if not already downloaded
    if not os.path.exists(glove_path):
        print("Downloading GloVe embeddings...")
        url = "http://nlp.stanford.edu/data/glove.6B.zip"
        zip_path = os.path.join(glove_dir, "glove.6B.zip")

        try:
            # Stream the archive to disk. Fail fast on an HTTP error instead
            # of saving an error page and crashing later in ZipFile, and do
            # not hang forever on a stalled connection.
            with requests.get(url, stream=True, timeout=60) as response:
                response.raise_for_status()
                with open(zip_path, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        f.write(chunk)

            # Extract the zip file
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                zip_ref.extractall(glove_dir)
        finally:
            # Remove the zip file, including a partial one after a failure
            if os.path.exists(zip_path):
                os.remove(zip_path)
        print("Download complete.")

        if not os.path.exists(glove_path):
            raise FileNotFoundError(
                f"'{glove_file}' was not found in the downloaded archive; "
                f"check that dimension={dimension} is provided by glove.6B"
            )

    # Load the embeddings
    print(f"Loading GloVe embeddings ({dimension}d)...")
    embeddings = {}
    with open(glove_path, 'r', encoding='utf-8') as f:
        for line_no, line in enumerate(f, start=1):
            line = line.rstrip()
            if not line:
                continue  # tolerate blank lines (e.g. a trailing newline)

            # Split from the right so exactly `dimension` numbers are peeled
            # off and whatever precedes them is kept intact as the word.
            parts = line.rsplit(' ', dimension)
            if len(parts) != dimension + 1:
                raise ValueError(
                    f"{glove_path}, line {line_no}: expected {dimension} "
                    f"values, found {len(parts) - 1}"
                )
            embeddings[parts[0]] = np.array(parts[1:], dtype='float32')

    print(f"Loaded {len(embeddings)} word vectors.")
    return embeddings

# Load the dataset
df = pd.read_csv('matched_all_answers_with_nifty_and_recommendation - Copy.csv')

# Sentiment analyzer and lemmatizer shared by the helper functions below
sia = SentimentIntensityAnalyzer()
lemmatizer = WordNetLemmatizer()

# Standard English stopwords extended with custom stopwords
additional_stopwords = ['company', 'issue', 'share', 'ipo', 'price']
stop_words = set(stopwords.words('english')).union(additional_stopwords)

# Function to clean and preprocess text
def preprocess_text(text):
    """Clean one summary into a space-joined string of lemmatized tokens.

    Steps: lower-case, replace every non-letter character with a space,
    tokenize with ``word_tokenize``, drop tokens of two characters or fewer
    and tokens in the module-level ``stop_words`` set, lemmatize with the
    module-level ``lemmatizer``, then drop lemmas that are stopwords.

    Args:
        text: Raw summary. Missing values (``pd.isna``) yield ``""``; other
            non-string values are converted with ``str()``.

    Returns:
        The cleaned tokens joined by single spaces (``""`` if none survive).
    """
    if pd.isna(text):
        return ""

    # Convert to lowercase (coerce first so a numeric cell cannot crash .lower())
    text = str(text).lower()

    # Replace special characters and numbers with a space. Deleting them
    # instead would fuse hyphenated terms ("year-over-year" -> "yearoveryear").
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)

    # Tokenize
    tokens = word_tokenize(text)

    # Remove stopwords and lemmatize
    cleaned_tokens = []
    for token in tokens:
        if len(token) <= 2 or token in stop_words:
            continue
        lemma = lemmatizer.lemmatize(token)
        # Check again after lemmatizing: inflected forms such as "shares" are
        # not in stop_words themselves but reduce to a stopword ("share"),
        # which would otherwise slip through the filter.
        if lemma in stop_words:
            continue
        cleaned_tokens.append(lemma)

    # Join tokens back into a string
    return ' '.join(cleaned_tokens)

# Function to perform sentiment analysis
def analyze_sentiment(text):
    """Return the 'compound' polarity score of ``text``.

    Missing values (as detected by ``pd.isna``) are scored as 0; everything
    else is passed to the module-level ``sia`` analyzer.
    """
    if not pd.isna(text):
        return sia.polarity_scores(text)['compound']
    return 0

# Apply preprocessing to the summary column
df['cleaned_summary'] = df['summary_of_all_answers'].apply(preprocess_text)

# Apply sentiment analysis and create a new column
df['sentiment_score'] = df['summary_of_all_answers'].apply(analyze_sentiment)

# Create TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
tfidf_matrix = tfidf_vectorizer.fit_transform(df['cleaned_summary'])

# Get feature names (words)
feature_names = tfidf_vectorizer.get_feature_names_out()

# Create the columns for the top 10 words; slots that are never filled keep
# the placeholders '' and 0.0
for i in range(10):
    df[f'word{i+1}'] = ''
    df[f'word{i+1}_val'] = 0.0

# Populate the word columns
for i, row in enumerate(tfidf_matrix):
    # Get the TF-IDF scores for this document
    row_data = row.toarray()[0]

    # Indices of the 10 largest scores, best first. Keep only positive scores:
    # for a document with fewer than 10 distinct terms (or an empty one) the
    # remaining positions would otherwise be filled with unrelated vocabulary
    # words that merely tie at 0.0.
    top_indices = [idx for idx in row_data.argsort()[-10:][::-1] if row_data[idx] > 0]

    # Populate the word and score columns
    for rank, word_idx in enumerate(top_indices, start=1):
        df.at[i, f'word{rank}'] = feature_names[word_idx]
        df.at[i, f'word{rank}_val'] = row_data[word_idx]

# Load GloVe embeddings
glove_embeddings = load_glove_embeddings(dimension=50)

# Function to get embedding for a word
def get_embedding(word):
    """Return the GloVe vector stored for ``word``.

    Words missing from the module-level ``glove_embeddings`` mapping get a
    50-element zero vector instead.
    """
    if word in glove_embeddings:
        return glove_embeddings[word]
    return np.zeros(50)

# Column names for the GloVe embeddings of each top word:
# word1_vector1 .. word1_vector50, word2_vector1 .. word10_vector50
embedding_columns = [
    f'word{word_num}_vector{dim}'
    for word_num in range(1, 11)  # For word1 to word10
    for dim in range(1, 51)       # For each of the 50 dimensions
]

# Fill one (rows x 500) array and attach it in a single concat. Inserting the
# 500 columns one at a time fragments the frame (pandas PerformanceWarning)
# and then needs tens of thousands of scalar df.at writes.
embedding_values = np.zeros((len(df), len(embedding_columns)))
for word_num in range(1, 11):
    first_col = (word_num - 1) * 50
    for row_pos, word in enumerate(df[f'word{word_num}']):
        # Empty word slots keep their all-zero vector
        if pd.notna(word) and word != '':
            embedding_values[row_pos, first_col:first_col + 50] = get_embedding(word)

embedding_df = pd.DataFrame(embedding_values, columns=embedding_columns, index=df.index)

# Drop any embedding columns left over from a previous run of this code so
# that re-running it does not produce duplicate column names
df = pd.concat([df.drop(columns=embedding_columns, errors='ignore'), embedding_df], axis=1)

# Save the results to a new CSV file
df.to_csv('processed_answers_with_tfidf_and_glove.csv', index=False)

print("Processing complete. Results saved to 'processed_answers_with_tfidf_and_glove.csv'")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Downloading GloVe embeddings...
Download complete.
Loading GloVe embeddings (50d)...
Loaded 400000 word vectors.


  df[f'word{word_num}_vector{dim}'] = 0.0
  df[f'word{word_num}_vector{dim}'] = 0.0
  df[f'word{word_num}_vector{dim}'] = 0.0
  df[f'word{word_num}_vector{dim}'] = 0.0
  df[f'word{word_num}_vector{dim}'] = 0.0
  df[f'word{word_num}_vector{dim}'] = 0.0
  df[f'word{word_num}_vector{dim}'] = 0.0
  df[f'word{word_num}_vector{dim}'] = 0.0
  df[f'word{word_num}_vector{dim}'] = 0.0
  df[f'word{word_num}_vector{dim}'] = 0.0
  df[f'word{word_num}_vector{dim}'] = 0.0
  df[f'word{word_num}_vector{dim}'] = 0.0
  df[f'word{word_num}_vector{dim}'] = 0.0
  df[f'word{word_num}_vector{dim}'] = 0.0
  df[f'word{word_num}_vector{dim}'] = 0.0
  df[f'word{word_num}_vector{dim}'] = 0.0
  df[f'word{word_num}_vector{dim}'] = 0.0
  df[f'word{word_num}_vector{dim}'] = 0.0
  df[f'word{word_num}_vector{dim}'] = 0.0
  df[f'word{word_num}_vector{dim}'] = 0.0
  df[f'word{word_num}_vector{dim}'] = 0.0
  df[f'word{word_num}_vector{dim}'] = 0.0
  df[f'word{word_num}_vector{dim}'] = 0.0
  df[f'word{word_num}_vector{dim}'

Processing complete. Results saved to 'processed_answers_with_tfidf_and_glove.csv'


In [None]:
# Fetch the 'punkt_tab' tokenizer data.
# NOTE(review): presumably this is what `word_tokenize` in the preprocessing
# cell above needs on recent NLTK versions — if so, it has to run before that
# cell on a fresh kernel; confirm and consider moving it up.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True