In [5]:
import re
import pickle
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Load new tweets
new_tweets = [
    "Just saw the most amazing movie! Highly recommend it to everyone.",
    "I can't believe the weather today—it's raining cats and dogs!",
    "Feeling so grateful for my friends and family. Life is good.",
    "This new phone I bought is such a disappointment. Wish I had researched more.",
    "Had a fantastic workout session at the gym today. Feeling energized!"
]

# Function for text cleaning, tokenization, and normalization
def clean_text(text):
    text = re.sub(r'[^A-Za-z\s]', '', text)  # Remove special characters and punctuation
    text = re.sub(r'@user', '', text)  # Remove mentions of @user
    tokens = word_tokenize(text)  # Tokenize text
    stop_words = set(stopwords.words('english'))
    tokens = [word.lower() for word in tokens if word.lower() not in stop_words]  # Remove stopwords
    return ' '.join(tokens)

# Preprocess new tweets
preprocessed_tweets = [clean_text(tweet) for tweet in new_tweets]

# Load the saved TF-IDF vectorizer
with open('vectorizer.pkl', 'rb') as f:
    tfidf_vectorizer = pickle.load(f)

# Transform new tweets using the loaded vectorizer
new_tweets_tfidf = tfidf_vectorizer.transform(preprocessed_tweets)

# Load the saved SVD model (TruncatedSVD for dimensionality reduction)
with open('svd_model.pkl', 'rb') as f:
    svd = pickle.load(f)

# Apply the SVD transformation to the new data (reduce dimensionality)
new_tweets_reduced = svd.transform(new_tweets_tfidf)

# Load the pre-trained KMeans model
with open('kmeans_model.pkl', 'rb') as f:
    kmeans = pickle.load(f)

# Check the type of loaded model
print(type(kmeans))  # Ensure this is <class 'sklearn.cluster._kmeans.KMeans'>

# Predict the clusters for the new data
kmeans_labels = kmeans.predict(new_tweets_reduced)

# Create a DataFrame to store results
df_results = pd.DataFrame({
    'Tweet': new_tweets,
    'KMeans_Label': kmeans_labels
})

# Display results
print(df_results)

# Save the new KMeans cluster labels for further use
with open('new_kmeans_labels.pkl', 'wb') as f:
    pickle.dump(kmeans_labels, f)

# Optionally, save results to a CSV file
df_results.to_csv('tweets_with_kmeans_labels.csv', index=False)


https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations


<class 'sklearn.cluster._kmeans.KMeans'>


ValueError: X has 100 features, but KMeans is expecting 28645 features as input.

In [6]:
import pickle
from sklearn.cluster import KMeans

with open('kmeans_model.pkl', 'rb') as f:
    kmeans = pickle.load(f)

print(type(kmeans))  # Should output: <class 'sklearn.cluster._kmeans.KMeans'>


<class 'sklearn.cluster._kmeans.KMeans'>


In [7]:
import pickle
import numpy as np
import pandas as pd
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re

# Load the saved models
with open('kmeans_model.pkl', 'rb') as f:
    kmeans = pickle.load(f)
with open('svd_model.pkl', 'rb') as f:
    svd = pickle.load(f)
with open('vectorizer.pkl', 'rb') as f:
    tfidf_vectorizer = pickle.load(f)

# Function for text cleaning, tokenization, and normalization
def clean_text(text):
    text = re.sub(r'[^A-Za-z\s]', '', text)  # Remove special characters and punctuation
    text = re.sub(r'@user', '', text)  # Remove mentions of @user
    tokens = word_tokenize(text)  # Tokenize text
    stop_words = set(stopwords.words('english'))
    tokens = [word.lower() for word in tokens if word.lower() not in stop_words]  # Remove stopwords
    return ' '.join(tokens)

# Load and preprocess new tweets
new_tweets = [
    "Just saw the most amazing movie! Highly recommend it to everyone.",
    "I can't believe the weather today—it's raining cats and dogs!",
    "Feeling so grateful for my friends and family. Life is good.",
    "This new phone I bought is such a disappointment. Wish I had researched more.",
    "Had a fantastic workout session at the gym today. Feeling energized!"
]
preprocessed_tweets = [clean_text(tweet) for tweet in new_tweets]

# Transform new tweets using the saved TF-IDF vectorizer
new_tweets_tfidf = tfidf_vectorizer.transform(preprocessed_tweets)

# Apply the SVD transformation to the new data (reduce dimensionality)
new_tweets_reduced = svd.transform(new_tweets_tfidf)

# Predict the clusters for the new data
kmeans_labels = kmeans.predict(new_tweets_reduced)

# Create a DataFrame to store results
df_results = pd.DataFrame({
    'Tweet': new_tweets,
    'KMeans_Label': kmeans_labels
})

# Display results
print(df_results)

# Save the new KMeans cluster labels for further use
with open('new_kmeans_labels.pkl', 'wb') as f:
    pickle.dump(kmeans_labels, f)

# Optionally, save results to a CSV file
df_results.to_csv('tweets_with_kmeans_labels.csv', index=False)


https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations


ValueError: X has 100 features, but KMeans is expecting 28645 features as input.

In [9]:
import pickle
import pandas as pd
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re

# Load the saved models
with open('kmeans_model.pkl', 'rb') as f:
    kmeans = pickle.load(f)
with open('svd_model.pkl', 'rb') as f:
    svd = pickle.load(f)
with open('vectorizer.pkl', 'rb') as f:
    tfidf_vectorizer = pickle.load(f)
    
# Function for text cleaning, tokenization, and normalization
def clean_text(text):
    text = re.sub(r'[^A-Za-z\s]', '', text)  # Remove special characters and punctuation
    text = re.sub(r'@user', '', text)  # Remove mentions of @user
    tokens = word_tokenize(text)  # Tokenize text
    stop_words = set(stopwords.words('english'))
    tokens = [word.lower() for word in tokens if word.lower() not in stop_words]  # Remove stopwords
    return ' '.join(tokens)

# Load and preprocess new tweets
new_tweets = [
    "Just saw the most amazing movie! Highly recommend it to everyone.",
    "I can't believe the weather today—it's raining cats and dogs!",
    "Feeling so grateful for my friends and family. Life is good.",
    "This new phone I bought is such a disappointment. Wish I had researched more.",
    "Had a fantastic workout session at the gym today. Feeling energized!"
]
preprocessed_tweets = [clean_text(tweet) for tweet in new_tweets]

# Transform new tweets using the saved TF-IDF vectorizer
new_tweets_tfidf = tfidf_vectorizer.transform(preprocessed_tweets)

# Apply the SVD transformation to the new data
new_tweets_reduced = svd.transform(new_tweets_tfidf)

# Predict the clusters for the new data
kmeans_labels = kmeans.predict(new_tweets_reduced)

# Create a DataFrame to store results
df_results = pd.DataFrame({
    'Tweet': new_tweets,
    'KMeans_Label': kmeans_labels
})

# Display results
print(df_results)

# Save the new KMeans cluster labels for further use
with open('new_kmeans_labels.pkl', 'wb') as f:
    pickle.dump(kmeans_labels, f)

# Optionally, save results to a CSV file
df_results.to_csv('tweets_with_kmeans_labels.csv', index=False)



https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations


                                               Tweet  KMeans_Label
0  Just saw the most amazing movie! Highly recomm...             2
1  I can't believe the weather today—it's raining...             2
2  Feeling so grateful for my friends and family....             0
3  This new phone I bought is such a disappointme...             2
4  Had a fantastic workout session at the gym tod...             2
