In [1]:
import pandas as pd

# Load the dataset
df = pd.read_excel('sample_customer_database_5000_singapore.xlsx')


In [2]:
# First look at the data
print(df.head())  # First 5 rows

  Customer ID        Full Name                Email Address  Phone Number  \
0       C0001     Norma Fisher          ysullivan@yahoo.com      82421948   
1       C0002      Levi Durham            qgrimes@gmail.com      97535139   
2       C0003   Kimberly Olsen  sean96@johnston-roberts.com      71122018   
3       C0004   Matthew Davies    nguyendarrell@hotmail.com      41352560   
4       C0005  Angela Martinez    myersmitchell@johnson.com        869141   

  Date Joined     Location  Gender Loyalty Tier  \
0  2023-08-11     Tampines  Female     Platinum   
1  2022-11-24      Geylang  Female     Platinum   
2  2023-06-19     Tampines  Female     Platinum   
3  2025-04-04   Ang Mo Kio    Male       Silver   
4  2025-01-15  Bukit Batok  Female     Platinum   

                                               Notes  
0                        Together range line beyond.  
1  Language ball floor meet usually board necessary.  
2                 Support time operation wear often.  
3         

In [3]:
# Shape of the dataset
print("Shape of dataset:", df.shape)

Shape of dataset: (5000, 9)


In [4]:
# Columns and Data types
print(df.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   Customer ID    5000 non-null   object        
 1   Full Name      5000 non-null   object        
 2   Email Address  5000 non-null   object        
 3   Phone Number   5000 non-null   int64         
 4   Date Joined    5000 non-null   datetime64[ns]
 5   Location       5000 non-null   object        
 6   Gender         5000 non-null   object        
 7   Loyalty Tier   5000 non-null   object        
 8   Notes          5000 non-null   object        
dtypes: datetime64[ns](1), int64(1), object(7)
memory usage: 351.7+ KB
None


In [5]:
# Check for missing values
print("Missing values per column:\n", df.isnull().sum())

Missing values per column:
 Customer ID      0
Full Name        0
Email Address    0
Phone Number     0
Date Joined      0
Location         0
Gender           0
Loyalty Tier     0
Notes            0
dtype: int64


In [6]:
# Check for duplicate rows
print("Number of duplicate rows:", df.duplicated().sum())

Number of duplicate rows: 0


In [7]:
# Unique values per column
print("Unique values per column:\n", df.nunique())

Unique values per column:
 Customer ID      5000
Full Name        4835
Email Address    4983
Phone Number     4998
Date Joined      1084
Location           27
Gender              2
Loyalty Tier        3
Notes            5000
dtype: int64


In [8]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Preprocess the text data: Lowercase and clean text
def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = ''.join([char for char in text if char.isalnum() or char.isspace()])  # Remove punctuation and non-alphabetical characters
    return text

# Apply preprocessing to 'Notes' column
df['Notes'] = df['Notes'].apply(preprocess_text)

# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)  # Limit to top 1000 features to avoid high-dimensional data

# Apply TF-IDF to the Notes column
tfidf_matrix = tfidf_vectorizer.fit_transform(df['Notes'])

# Convert the result to a DataFrame for easier viewing
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

# Show the result (first 5 rows)
print(tfidf_df.head())

   ability  able  accept  according  account  act  action  activity  actually  \
0      0.0   0.0     0.0        0.0      0.0  0.0     0.0       0.0       0.0   
1      0.0   0.0     0.0        0.0      0.0  0.0     0.0       0.0       0.0   
2      0.0   0.0     0.0        0.0      0.0  0.0     0.0       0.0       0.0   
3      0.0   0.0     0.0        0.0      0.0  0.0     0.0       0.0       0.0   
4      0.0   0.0     0.0        0.0      0.0  0.0     0.0       0.0       0.0   

   add  ...  world  worry  write  writer  wrong  yard  yeah  year  yes  young  
0  0.0  ...    0.0    0.0    0.0     0.0    0.0   0.0   0.0   0.0  0.0    0.0  
1  0.0  ...    0.0    0.0    0.0     0.0    0.0   0.0   0.0   0.0  0.0    0.0  
2  0.0  ...    0.0    0.0    0.0     0.0    0.0   0.0   0.0   0.0  0.0    0.0  
3  0.0  ...    0.0    0.0    0.0     0.0    0.0   0.0   0.0   0.0  0.0    0.0  
4  0.0  ...    0.0    0.0    0.0     0.0    0.0   0.0   0.0   0.0  0.0    0.0  

[5 rows x 763 columns]


In [9]:
pip install gensim


Collecting gensim
  Downloading gensim-4.3.3-cp310-cp310-win_amd64.whl.metadata (8.2 kB)
Collecting numpy<2.0,>=1.18.5 (from gensim)
  Downloading numpy-1.26.4-cp310-cp310-win_amd64.whl.metadata (61 kB)
Collecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp310-cp310-win_amd64.whl.metadata (60 kB)
Collecting smart-open>=1.8.1 (from gensim)
  Downloading smart_open-7.1.0-py3-none-any.whl.metadata (24 kB)
Collecting wrapt (from smart-open>=1.8.1->gensim)
  Downloading wrapt-1.17.2-cp310-cp310-win_amd64.whl.metadata (6.5 kB)
Downloading gensim-4.3.3-cp310-cp310-win_amd64.whl (24.0 MB)
   ---------------------------------------- 0.0/24.0 MB ? eta -:--:--
   --- ------------------------------------ 2.1/24.0 MB 10.7 MB/s eta 0:00:03
   ------ --------------------------------- 4.2/24.0 MB 10.1 MB/s eta 0:00:02
   ----------- ---------------------------- 6.8/24.0 MB 11.0 MB/s eta 0:00:02
   ----------------- ---------------------- 10.2/24.0 MB 12.0 MB/s eta 0:00:02
   -----

  You can safely remove it manually.
  You can safely remove it manually.
  You can safely remove it manually.
  You can safely remove it manually.


In [10]:
df['Notes'] = df['Notes'].fillna('')


In [11]:
import pandas as pd
import nltk
from gensim.models import Word2Vec
import numpy as np

# Ensure you have the necessary NLTK data
nltk.download('punkt')

# Sample dataframe (replace with actual DataFrame)
# df = pd.read_csv('your_file.csv')

# Preprocess the text data (lowercase, remove punctuation)
def preprocess_text(text):
    if pd.isna(text):
        return ''  # Return empty string if the text is NaN
    text = text.lower()  # Convert to lowercase
    text = ''.join([char for char in text if char.isalnum() or char.isspace()])  # Remove punctuation and special characters
    return text

# Apply preprocessing to 'Notes' column
df['Notes'] = df['Notes'].apply(preprocess_text)

# Tokenize the 'Notes' column and check for any empty lists
df['Tokens'] = df['Notes'].apply(lambda x: nltk.word_tokenize(x))

# Remove rows where 'Tokens' is an empty list
df = df[df['Tokens'].apply(len) > 0]

# Create the Word2Vec model using the tokens from the 'Notes' column
# We will use 'skip-gram' model (default) with 100 dimensions and a window size of 5 words
model = Word2Vec(df['Tokens'], vector_size=100, window=5, min_count=1, workers=4)

# Training Word2Vec model (you can save it for future use)
model.save("word2vec_model.model")

# Example: Getting the vector for a word (e.g., 'loyalty')
vector = model.wv['loyalty']  # Replace 'loyalty' with any word in your dataset
print("Word vector for 'loyalty':", vector)
print("Vector shape:", vector.shape)

# Function to get the average Word2Vec vector for a list of tokens (e.g., each 'Note')
def get_average_word2vec(tokens_list, model, vector_size=100):
    # Initialize an empty vector for the average word vector
    if len(tokens_list) == 0:
        return [0] * vector_size
    else:
        vec = np.zeros(vector_size)
        valid_words = 0
        for word in tokens_list:
            if word in model.wv:
                vec = np.add(vec, model.wv[word])
                valid_words += 1
        if valid_words > 0:
            vec = vec / valid_words  # Average the vectors
        return vec

# Get average Word2Vec embeddings for all 'Notes'
df['Word2Vec'] = df['Tokens'].apply(lambda x: get_average_word2vec(x, model))

# The 'Word2Vec' column now contains the embedding for each customer’s notes
print(df[['Full Name', 'Word2Vec']].head())


ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

In [None]:
import nltk

# Set the directory for NLTK data
nltk.data.path.append(r'C:\nltk_data')  # Change to your NLTK data directory

# Try downloading the punkt tokenizer again
nltk.download('punkt')


In [None]:
import pandas as pd
import re

# Load dataset
df = pd.read_excel("sample_customer_database_5000_singapore.xlsx")

# Fill missing notes with empty string
df['Notes'] = df['Notes'].fillna('')

# Preprocess text: lowercase, remove punctuation and digits
def preprocess_text(text):
    text = text.lower()  # lowercase
    text = re.sub(r'[^a-z\s]', '', text)  # remove everything except letters and whitespace
    return text.strip()

# Apply preprocessing
df['Cleaned_Notes'] = df['Notes'].apply(preprocess_text)

# Show sample
print(df[['Notes', 'Cleaned_Notes']].head())


In [None]:
import nltk

# Force download into the default directory
nltk.download('punkt', download_dir='C:/Users/Lenovo/AppData/Roaming/nltk_data')
nltk.download('stopwords', download_dir='C:/Users/Lenovo/AppData/Roaming/nltk_data')

# (optional) Re-add the path explicitly
nltk.data.path.append("C:/Users/Lenovo/AppData/Roaming/nltk_data")


In [None]:
import nltk
print(nltk.data.path)



In [None]:
import nltk

# Force download into the default directory
nltk.download('punkt', download_dir='C:/Users/Lenovo/AppData/Roaming/nltk_data')
nltk.download('stopwords', download_dir='C:/Users/Lenovo/AppData/Roaming/nltk_data')

# (optional) Re-add the path explicitly
nltk.data.path.append("C:/Users/Lenovo/AppData/Roaming/nltk_data")


In [None]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download required NLTK data
nltk.download('punkt')
nltk.download('stopwords')

# Load stopwords
stop_words = set(stopwords.words('english'))

# Preprocessing function
def preprocess_text(text):
    # Lowercase
    text = text.lower()
    # Remove punctuation and non-alphanumeric characters
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenize
    tokens = word_tokenize(text)
    # Remove stopwords
    filtered_tokens = [word for word in tokens if word not in stop_words]
    # Join back to string
    return ' '.join(filtered_tokens)

# Apply to Notes column
df['Cleaned_Notes'] = df['Notes'].astype(str).apply(preprocess_text)

# View results
print(df[['Notes', 'Cleaned_Notes']].head())


In [None]:
import nltk
nltk.download('punkt')  # Re-download the correct tokenizer


In [None]:
nltk.download('punkt', download_dir='C:/Users/Lenovo/AppData/Roaming/nltk_data')
nltk.data.path.append('C:/Users/Lenovo/AppData/Roaming/nltk_data')


In [None]:
from nltk.tokenize import word_tokenize
import nltk

tokens = word_tokenize("This is a test sentence.")
print(tokens)


LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - 'C:\\Users\\Lenovo/nltk_data'
    - 'c:\\Users\\Lenovo\\anaconda3\\nltk_data'
    - 'c:\\Users\\Lenovo\\anaconda3\\share\\nltk_data'
    - 'c:\\Users\\Lenovo\\anaconda3\\lib\\nltk_data'
    - 'C:\\Users\\Lenovo\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************


In [None]:
import pandas as pd
import re

# Load dataset
df = pd.read_excel("sample_customer_database_5000_singapore.xlsx")

# Fill missing notes with empty string
df['Notes'] = df['Notes'].fillna('')

# Preprocess text: lowercase, remove punctuation and digits
def preprocess_text(text):
    text = text.lower()  # lowercase
    text = re.sub(r'[^a-z\s]', '', text)  # remove everything except letters and whitespace
    return text.strip()

# Apply preprocessing
df['Cleaned_Notes'] = df['Notes'].apply(preprocess_text)

# Show sample
print(df[['Notes', 'Cleaned_Notes']].head())


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorize Cleaned_Notes
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(df['Cleaned_Notes'])

# Check shape
print("TF-IDF matrix shape:", tfidf_matrix.shape)


In [None]:
from sklearn.cluster import KMeans

# Number of clusters
num_clusters = 5

# Apply KMeans clustering to the TF-IDF matrix
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
labels = kmeans.fit_predict(tfidf_matrix)

# Assign cluster labels to the DataFrame
df['Text_Cluster_Label'] = labels

# View results
print(df[['Cleaned_Notes', 'Text_Cluster_Label']].head())
