## Step 1: Import Required Libraries

In [1]:
# Import essential libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk

# Download NLTK datasets (if not already done)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\abhil\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\abhil\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\abhil\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Step 2: Create a Synthetic Dataset

In [2]:
# Create synthetic data for customer segmentation
customer_data = pd.DataFrame({
    'CustomerID': [1, 2, 3, 4, 5],
    'PurchaseAmount': [200, 150, 300, 400, 100],
    'PurchaseFrequency': [3, 5, 2, 4, 6]
})

# Create synthetic text data for sentiment analysis
text_data = pd.DataFrame({
    'Review': [
        "I love the product, it's amazing!",
        "Terrible experience, will never buy again.",
        "Decent quality but not worth the price.",
        "Excellent service and great value.",
        "Worst product I have ever used."
    ],
    'Sentiment': [1, 0, 0, 1, 0]  # 1 = Positive, 0 = Negative
})


## Step 3: K-Means Clustering for Customer Segmentation

In [3]:
# Prepare data for clustering
features = customer_data[['PurchaseAmount', 'PurchaseFrequency']]

# Apply K-Means Clustering
kmeans = KMeans(n_clusters=2, random_state=42)
customer_data['Cluster'] = kmeans.fit_predict(features)

# Evaluate clustering performance
silhouette_avg = silhouette_score(features, customer_data['Cluster'])
print(f"Silhouette Score: {silhouette_avg}")

# Display clusters
print(customer_data)


Silhouette Score: 0.5765558837701512
   CustomerID  PurchaseAmount  PurchaseFrequency  Cluster
0           1             200                  3        0
1           2             150                  5        0
2           3             300                  2        1
3           4             400                  4        1
4           5             100                  6        0


## Step 4: Text Preprocessing for Sentiment Analysis

In [4]:
# Define text preprocessing function
def preprocess_text(text):
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    
    # Tokenization
    tokens = word_tokenize(text.lower())
    
    # Remove stopwords and lemmatize
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word.isalnum() and word not in stop_words]
    return ' '.join(tokens)

# Apply preprocessing to text data
text_data['Processed_Review'] = text_data['Review'].apply(preprocess_text)
print(text_data[['Review', 'Processed_Review']])


                                       Review               Processed_Review
0           I love the product, it's amazing!           love product amazing
1  Terrible experience, will never buy again.  terrible experience never buy
2     Decent quality but not worth the price.     decent quality worth price
3          Excellent service and great value.  excellent service great value
4             Worst product I have ever used.        worst product ever used


## Step 5: Feature Engineering using TF-IDF

In [5]:
# Convert text data to TF-IDF features
tfidf_vectorizer = TfidfVectorizer()
X = tfidf_vectorizer.fit_transform(text_data['Processed_Review'])

# Extract labels
y = text_data['Sentiment']


## Step 6: Train-Test Split

In [6]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


## Step 7: Build and Evaluate Sentiment Analysis Model

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Train Logistic Regression Model
model = LogisticRegression()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate model performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print(classification_report(y_test, y_pred))


Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         1

    accuracy                           1.00         1
   macro avg       1.00      1.00      1.00         1
weighted avg       1.00      1.00      1.00         1

