In [15]:
import os
import pandas as pd

# Set the working directory
os.chdir("/Users/jade/Desktop/CrystalMarket")

In [85]:
# comments that are upvoted do not necessary indicate positive comments about products being upvoted; 
# it can be something negative like this crystal should be fake or how to detect fake crystals 
# our goal for the unstructured data analysis is to find hot topics under crystal market and the topics themselves 
# can be neutral but should indicate what customers care and value the most(appearing most frequent)

# Define custom stopwords (add more as needed)
custom_stopwords = set([
    "like", "it's", "its", "one", "even", "see", "look", "would", 
    "definitely", "also", "much", "good", "maybe", "could", "make",
    "that’s", "that", "it.", "&", "thank", "really"])

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download required NLTK data
nltk.download("stopwords")
nltk.download("wordnet")

# Load standard stopwords
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

# Combine standard stopwords with custom stopwords
all_stopwords = stop_words.union(custom_stopwords)

# Function to clean text and remove unimportant words
def preprocess_text(text):
    text = re.sub(r'\d+', '', text)  # Remove numbers
    words = text.lower().split()
    words = [word for word in words if word not in all_stopwords]  # Remove stopwords
    words = [lemmatizer.lemmatize(word) for word in words]  # Lemmatization
    return words

# Apply preprocessing to dataset
df["body"] = df["body"].astype(str).apply(preprocess_text)



[nltk_data] Downloading package stopwords to /Users/jade/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/jade/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [86]:
# Tokenize all words
all_words = " ".join(df["body"].astype(str)).lower().split()

# Count word occurrences
word_counts = Counter(all_words)

# Define threshold for too common words (e.g., words appearing in > 5% of the dataset)
common_threshold = len(all_words) * 0.05  # 5% of total words

# Identify useful words (not in stopwords and not too common)
filtered_words = {word for word, count in word_counts.items() if word not in stop_words and count < common_threshold}

print(f"✅ Meaningful Words Remaining: {len(filtered_words)}")


✅ Meaningful Words Remaining: 5459


In [88]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Convert text into list format for TF-IDF
df["text"] = df["body"].astype(str).apply(lambda x: " ".join(x) if isinstance(x, list) else str(x))


In [89]:
# Apply TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)
tfidf_matrix = vectorizer.fit_transform(df["text"])

# Extract important words
selected_keywords = vectorizer.get_feature_names_out()
print(f"🔹 Selected {len(selected_keywords)} Key Market Words")
print(f"🔥 Top 20 Keywords: {selected_keywords[:20]}")

🔹 Selected 3357 Key Market Words
🔥 Top 20 Keywords: ['aaa' 'aaefdbffdafee' 'aaw' 'abalone' 'abilities' 'ability' 'able'
 'about' 'absolutely' 'absurdly' 'abundant' 'abusive' 'acc' 'accept'
 'acceptable' 'access' 'accessories' 'according' 'accumulated' 'accurate']


In [90]:
from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel

# Convert cleaned text into tokenized format
df["tokenized_text"] = df["text"].apply(lambda x: x.split())

# Create dictionary and corpus
dictionary = Dictionary(df["tokenized_text"])
corpus = [dictionary.doc2bow(text) for text in df["tokenized_text"]]

# Train LDA Model
NUM_TOPICS = 5
lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=NUM_TOPICS, passes=10, random_state=42)

# Print the top words in each topic
for topic_id, words in lda_model.show_topics(num_topics=NUM_TOPICS, num_words=10, formatted=False):
    print(f"\n🔹 Topic {topic_id + 1}: " + ", ".join([word for word, _ in words]))


🔹 Topic 1: 'fluorite',, 'it’s',, 'fake',, 'quartz',, 'think',, 'know',, 'people',, 'photo',, 'piece',, 'i’m',

🔹 Topic 2: '.',, 'pretty',, 'citrine',, 'amethyst',, ['look',, 'look',, 'opal',, 'may',, 'crystal',, 'piece',

🔹 Topic 3: 'crystal',, 'it’s',, 'stone',, 'fluorite',, 'use',, 'want',, 'know',, 'fake',, 'piece',, 'love',

🔹 Topic 4: 'it’s',, 'buy',, 'look',, 'don’t',, 'say',, 'sure',, 'get',, 'i’m',, 'fluorite',, 'natural',

🔹 Topic 5: 'crystal',, 'know',, 'piece',, 'feel',, 'it’s',, 'different',, 'get',, 'want',, 'sure',, ['it’s',


In [99]:
df=pd.read_csv("cc.csv")

In [100]:
# Define an expanded custom stopword list
custom_stopwords = set([
    "like", "it's", "its", "one", "even", "see", "look", "would", "definitely", 
    "also", "much", "good", "maybe", "could", "make", "that’s", "that", "it.", "&", 
    "thank", "really","think", "know", "people", 
    "photo", "piece", "i’m", "pretty", "citrine", "amethyst", "opal", "may", "stone",
    "use", "want", "buy", "don’t", "say", "sure", "get", "feel", "different","it's",
    "crystal","look",".","bc","u","$"
])


In [101]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download stopwords if not already available
nltk.download("stopwords")
nltk.download("wordnet")

# Load NLTK stopwords and merge with custom stopwords
stop_words = set(stopwords.words("english")).union(custom_stopwords)
lemmatizer = WordNetLemmatizer()

# Function to clean text and filter out unimportant words
def preprocess_text(text):
    text = re.sub(r'\d+', '', text)  # Remove numbers
    words = text.lower().split()
    words = [word for word in words if word not in stop_words]  # Remove stopwords
    words = [lemmatizer.lemmatize(word) for word in words]  # Lemmatization
    return words

# Apply the updated preprocessing function
df["cleaned_text"] = df["body"].astype(str).apply(preprocess_text)


[nltk_data] Downloading package stopwords to /Users/jade/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/jade/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [103]:
# Apply TF-IDF
df["text"] = df["cleaned_text"].astype(str).apply(lambda x: " ".join(x) if isinstance(x, list) else str(x))
vectorizer = TfidfVectorizer(max_features=5000)
tfidf_matrix = vectorizer.fit_transform(df["text"])

# Extract important words
selected_keywords = vectorizer.get_feature_names_out()
print(f"🔹 Selected {len(selected_keywords)} Key Market Words")
print(f"🔥 Top 20 Keywords: {selected_keywords[:20]}")

🔹 Selected 3459 Key Market Words
🔥 Top 20 Keywords: ['aaa' 'aaefdbffdafee' 'aaw' 'abalone' 'abilities' 'ability' 'able'
 'about' 'absolutely' 'absorb' 'absorbing' 'absurdly' 'abundant' 'abuse'
 'abusive' 'acc' 'accept' 'acceptable' 'access' 'accessories']


In [104]:
from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel

# Convert cleaned text into dictionary and corpus format for LDA
dictionary = Dictionary(df["text"])
corpus = [dictionary.doc2bow(text) for text in df["text"]]

# Train LDA Model
NUM_TOPICS = 5
lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=NUM_TOPICS, passes=10, random_state=42)

# Print the top words in each topic
for topic_id, words in lda_model.show_topics(num_topics=NUM_TOPICS, num_words=10, formatted=False):
    print(f"\n🔹 Topic {topic_id + 1}: " + ", ".join([word for word, _ in words]))


TypeError: doc2bow expects an array of unicode tokens on input, not a single string

In [105]:
import pandas as pd
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel
import nltk
from nltk.corpus import stopwords

# Download stopwords if not available
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))

# Load dataset
df = pd.read_csv("cc.csv")

# Tokenize all words and count occurrences
all_words = " ".join(df["body"].astype(str)).lower().split()
word_counts = Counter(all_words)

# Define threshold for too common words (5% of total words)
common_threshold = len(all_words) * 0.05

# Identify meaningful words (not stopwords & not overly common)
filtered_words = {word for word, count in word_counts.items() if word not in stop_words and count < common_threshold}

print(f"✅ Meaningful Words Remaining: {len(filtered_words)}")



[nltk_data] Downloading package stopwords to /Users/jade/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


✅ Meaningful Words Remaining: 5251


In [None]:
# Convert text column to lists of tokens (tokenized format)
df["tokenized_text"] = df["body"].astype(str).apply(lambda x: [word for word in x.lower().split() if word in filtered_words])

# Convert lists to strings for TF-IDF processing
df["text"] = df["tokenized_text"].apply(lambda x: " ".join(x))

# Apply TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)
tfidf_matrix = vectorizer.fit_transform(df["text"])

# Extract important words
selected_keywords = vectorizer.get_feature_names_out()
print(f"🔹 Selected {len(selected_keywords)} Key Market Words")
print(f"🔥 Top 20 Keywords: {selected_keywords[:20]}")


In [None]:
# Convert tokenized text into dictionary and corpus format for LDA
dictionary = Dictionary(df["tokenized_text"])  # ✅ Ensure tokenized lists are passed
corpus = [dictionary.doc2bow(text) for text in df["tokenized_text"]]

# Train LDA Model
NUM_TOPICS = 5
lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=NUM_TOPICS, passes=10, random_state=42)

# Print the top words in each topic
for topic_id, words in lda_model.show_topics(num_topics=NUM_TOPICS, num_words=10, formatted=False):
    print(f"\n🔹 Topic {topic_id + 1}: " + ", ".join([word for word, _ in words]))

In [114]:
import pandas as pd
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel
import nltk
from nltk.corpus import stopwords
import re

# Download stopwords if not available
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))

# Define an expanded custom stopword list
custom_stopwords = set([
    "like", "it’s", "its", "one", "even", "see", "look", "would", "definitely", 
    "also", "much", "good", "maybe", "could", "make", "that’s", "that", "it.", "&", 
    "thank", "really", "think", "know", "people", "photo", "piece", "i’m", "pretty", 
     "may", "stone", "use", "want", "buy", "don’t", 
    "say", "sure", "get", "feel", "different", "it's", "crystal", "look", ".", "bc", 
    "u", "$"," -", "😍","might","thought","never","time","still","but","cotton"
])

# Merge standard stopwords and custom stopwords
all_stopwords = stop_words.union(custom_stopwords)

# Load dataset
df = pd.read_csv("cc.csv")


[nltk_data] Downloading package stopwords to /Users/jade/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [115]:
# Tokenize all words and count occurrences
all_words = " ".join(df["body"].astype(str)).lower().split()
word_counts = Counter(all_words)

# Define threshold for too common words (5% of total words)
common_threshold = len(all_words) * 0.05

# Identify meaningful words (not stopwords & not overly common)
filtered_words = {word for word, count in word_counts.items() if word not in all_stopwords and count < common_threshold}

print(f"✅ Meaningful Words Remaining: {len(filtered_words)}")

✅ Meaningful Words Remaining: 5203


In [116]:
# Text preprocessing function
def preprocess_text(text):
    text = re.sub(r'\d+', '', text)  # Remove numbers
    words = text.lower().split()
    words = [word for word in words if word in filtered_words]  # Keep only meaningful words
    return words

# Apply text preprocessing
df["tokenized_text"] = df["body"].astype(str).apply(preprocess_text)

# Convert tokenized text back to strings for TF-IDF
df["text"] = df["tokenized_text"].apply(lambda x: " ".join(x))

# Apply TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)
tfidf_matrix = vectorizer.fit_transform(df["text"])

# Extract important words
selected_keywords = vectorizer.get_feature_names_out()
print(f"🔹 Selected {len(selected_keywords)} Key Market Words")
print(f"🔥 Top 20 Keywords: {selected_keywords[:20]}")

# Convert tokenized text into dictionary and corpus format for LDA
dictionary = Dictionary(df["tokenized_text"])  # ✅ Ensure tokenized lists are passed
corpus = [dictionary.doc2bow(text) for text in df["tokenized_text"]]

# Train LDA Model
NUM_TOPICS = 5
lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=NUM_TOPICS, passes=10, random_state=42)

# Print the top words in each topic
for topic_id, words in lda_model.show_topics(num_topics=NUM_TOPICS, num_words=10, formatted=False):
    print(f"\n🔹 Topic {topic_id + 1}: " + ", ".join([word for word, _ in words]))

🔹 Selected 3422 Key Market Words
🔥 Top 20 Keywords: ['aaa' 'aaw' 'abalone' 'abilities' 'ability' 'able' 'about' 'absolutely'
 'absorb' 'absorbing' 'absurdly' 'abundant' 'abuse' 'abusive' 'acc'
 'accept' 'acceptable' 'access' 'accessories' 'according']

🔹 Topic 1: looks, fluorite, natural, color, made, crystals, quality, love, fake, come

🔹 Topic 2: citrine, crystals, fake, fluorite, quartz, love, new, rose, that's, real

🔹 Topic 3: looks, yes, put, take, beautiful!, me., light, that's, things, going

🔹 Topic 4: looks, quartz, agate, crystals, fluorite, green, real, moss, thanks, almost

🔹 Topic 5: amethyst, beautiful, opal, tell, love, lot, kind, heat, opalite, you!
