## import

In [None]:
import nltk
import pandas as pd
from nltk.util import ngrams
from collections import Counter
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import string

In [None]:
# Mount Google Drive at /content/drive; the data files opened later in this
# notebook live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## TF-IDF

In [None]:
# 1. Read the text file as a list of lines (one list element per line)
with open('/content/drive/MyDrive/iss/5001/GENERAL INFORMATION.txt', 'r', encoding='utf-8') as file:
    text_data = file.readlines()

In [None]:
# Download the stop-word list (in case it has not been downloaded yet)
nltk.download('stopwords')

# Load the English stop-word list into a set; preprocess_text filters against it
stop_words = set(stopwords.words('english'))

# 2. Text preprocessing
def preprocess_text(text):
    """Normalise one piece of text before TF-IDF vectorisation.

    The text is lowercased, every character that is not an ASCII letter,
    a digit or whitespace is deleted, and tokens found in the module-level
    ``stop_words`` set are dropped.

    Returns the remaining tokens joined by single spaces.
    """
    cleaned = re.sub(r'[^a-zA-Z0-9\s]', '', text.lower())
    return ' '.join(token for token in cleaned.split() if token not in stop_words)

# Clean every input line before vectorising
preprocessed_data = list(map(preprocess_text, text_data))

# 3. Fit a default TfidfVectorizer on the cleaned lines and transform them
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(preprocessed_data)

# 4. Store the TF-IDF scores in a pandas DataFrame with one column per term
feature_names = tfidf_vectorizer.get_feature_names_out()
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)

# 5. Sum each term's TF-IDF score over all rows and rank the terms from the
#    highest total to the lowest
word_frequencies = tfidf_df.sum(axis=0).sort_values(ascending=False)
# print(word_frequencies)

# 6. Show the 50 highest-ranked terms
top_50_words = word_frequencies.head(50)
print(top_50_words)

plant           19.885357
plants          13.592569
leaves          13.453856
family          10.098758
genus            9.728717
species          9.616411
known            9.129046
philodendron     8.988529
green            8.968011
native           7.915988
also             7.872250
foliage          7.442807
indoor           7.197701
tropical         7.192686
grow             7.163954
popular          6.467067
flowering        6.425223
tree             6.422277
one              6.092995
light            5.974086
red              5.880311
common           5.600970
dark             5.470250
houseplant       5.459303
name             5.256599
cactus           5.159550
long             5.154078
like             5.064711
evergreen        4.920710
ficus            4.724164
houseplants      4.705192
often            4.672859
araceae          4.623844
stems            4.572468
color            4.528370
grown            4.527553
flower           4.520414
several          4.479401
south       

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


## N-grams

In [None]:
# 1. Read the whole text file into a single string
with open('/content/drive/MyDrive/iss/5001/GENERAL INFORMATION.txt', 'r', encoding='utf-8') as file:
    text = file.read()

In [None]:
# Download NLTK's 'punkt' package (presumably the tokenizer data needed by
# nltk.word_tokenize, which is called below -- confirm against the NLTK version in use)
nltk.download('punkt')

# 2. Text preprocessing
def preprocess_text(text):
    """Return *text* lowercased with every ``string.punctuation`` character removed."""
    punctuation_table = str.maketrans('', '', string.punctuation)
    return text.lower().translate(punctuation_table)

# 3. Tokenisation
def tokenize_and_remove_stopwords(text):
    """Tokenise *text* with ``nltk.word_tokenize`` and drop English stop words.

    Returns the surviving tokens as a list, in their original order.
    """
    tokens = nltk.word_tokenize(text)
    english_stopwords = set(nltk.corpus.stopwords.words('english'))
    return [token for token in tokens if token not in english_stopwords]

# Normalise the raw text, then tokenise it and drop stop words
preprocessed_text = preprocess_text(text)
words = tokenize_and_remove_stopwords(preprocessed_text)

# 4. Build the n-grams and count their frequencies for every n in n_values
#    (currently 1-grams and 2-grams); the result maps n -> Counter of n-gram tuples
n_values = [1, 2]
ngram_results = {n: Counter(ngrams(words, n)) for n in n_values}

# 5. Print the 50 most frequent n-grams, with their counts, for each n
for n in n_values:
    print(f"{n}-grams:")
    top_ngrams = ngram_results[n].most_common(50)
    for ngram, count in top_ngrams:
        print(f"{ngram}: {count}")
    print()

# Print the top-50 1-grams once more, this time without their counts.
# This also leaves `top_ngrams` bound to the 1-gram list, which the merge
# step later in the notebook reads.
print(f"{1}-grams:")
top_ngrams = ngram_results[1].most_common(50)
for ngram, count in top_ngrams:
    print(f"{ngram}")
print()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


1-grams:
('plant',): 471
('leaves',): 259
('plants',): 215
('known',): 140
('family',): 130
('green',): 126
('species',): 122
('also',): 115
('native',): 114
('genus',): 112
('indoor',): 101
('foliage',): 100
('philodendron',): 90
('tropical',): 89
('grow',): 87
('light',): 79
('popular',): 77
('tree',): 64
('red',): 62
('houseplant',): 62
('flowering',): 61
('one',): 56
('common',): 56
('cactus',): 53
('dark',): 53
('soil',): 53
('grown',): 52
('name',): 51
('small',): 50
('often',): 48
('bright',): 47
('like',): 46
('stems',): 44
('long',): 44
('used',): 44
('flowers',): 43
('evergreen',): 43
('conditions',): 43
('succulent',): 43
('making',): 43
('commonly',): 42
('easy',): 42
('care',): 42
('white',): 41
('houseplants',): 41
('unique',): 41
('color',): 40
('attractive',): 40
('beautiful',): 40
('air',): 40

2-grams:
('also', 'known'): 60
('dark', 'green'): 45
('flowering', 'plants'): 29
('family', 'araceae'): 28
('flowering', 'plant'): 28
('plants', 'family'): 27
('species', 'flowe

## Merge

In [None]:
# Merge the top-50 TF-IDF words with the top-50 most frequent 1-grams.
# The 1-gram list is read directly from ngram_results[1] instead of from the
# leftover `top_ngrams` variable, so the result does not depend on which
# n-gram cell happened to run last.
top_unigrams = [ngram[0] for ngram, _ in ngram_results[1].most_common(50)]

# De-duplicate while keeping first-seen order (dict keys preserve insertion
# order), which makes the printed list reproducible between runs.
merged_top_words = list(dict.fromkeys([*top_50_words.index, *top_unigrams]))
print(merged_top_words)
print(len(merged_top_words))

['leaves', 'plant', 'one', 'beautiful', 'genus', 'tropical', 'family', 'light', 'cactus', 'several', 'philodendron', 'soil', 'species', 'grown', 'flowers', 'foliage', 'indoors', 'care', 'small', 'houseplants', 'evergreen', 'attractive', 'color', 'popular', 'easy', 'make', 'also', 'bright', 'fern', 'indoor', 'used', 'air', 'native', 'conditions', 'south', 'unique', 'plants', 'making', 'tree', 'name', 'dark', 'commonly', 'grow', 'like', 'white', 'known', 'slowgrowing', 'green', 'houseplant', 'alocasia', 'ficus', 'flowering', 'home', 'succulent', 'long', 'often', 'caladium', 'common', 'stems', 'araceae', 'red', 'flower']
62


## Test

In [None]:
import pandas as pd
# Toy frame with a single categorical column holding three levels
df = pd.DataFrame({'Category': list('ABC')})
print(df)
print("Dummy variables")
# One-hot encode as integers; drop_first=True leaves out the first level's column
df_dummy = pd.get_dummies(df.Category, drop_first=True, dtype=int)
print(df_dummy)

  Category
0        A
1        B
2        C
Dummy variables
   B  C
0  0  0
1  1  0
2  0  1


In [None]:
import pandas as pd
from collections import Counter

# Raw sample data: one soil-requirement description per row
_POTTING_SOIL_NOTE = "A well-drained, lightly acidic potting soil is perfect."
data = {
    "SOIL REQUIREMENT": [
        "Suitable for light (sandy), medium (loamy) soils, prefers well-drained soil and can grow in nutritionally poor soil.",
        "Most ferns grow best in slightly acidic soils; however, maidenhair ferns prefer a more alkaline soil pH. Adding some ground limestone to the potting mix of container-grown plants or mixing it into your outdoor beds will help with this.",
    ] + [_POTTING_SOIL_NOTE] * 5  # the same sentence fills the last five rows
}

# Build the DataFrame
df = pd.DataFrame(data)

# Tokenise and count word frequencies
def calculate_word_frequencies(text):
    """Count how often each word occurs in *text*.

    The text is lowercased and stripped of ``string.punctuation`` characters
    before it is split on whitespace, so that variants such as "Soil",
    "soil." and "soil," are counted as the same word instead of as three
    distinct tokens. This mirrors the normalisation used by the other
    preprocessing functions in this notebook.

    Args:
        text: The string to analyse.

    Returns:
        A ``Counter`` mapping each normalised word to its number of
        occurrences; empty when *text* contains no words.
    """
    # Local import: this cell itself only imports pandas and Counter.
    import string

    normalized = text.lower().translate(str.maketrans('', '', string.punctuation))
    return Counter(normalized.split())

# How many of the most frequent words to keep per row
top_words_count = 5  # keep the top 5 words

# Count the words of every row, then store the most frequent ones in a new column
word_counts = df['SOIL REQUIREMENT'].apply(calculate_word_frequencies)
df['Top_Words'] = word_counts.apply(
    lambda counts: [word for word, _ in counts.most_common(top_words_count)]
)

# Show the result
print(df)


                                    SOIL REQUIREMENT  \
0  Suitable for light (sandy), medium (loamy) soi...   
1  Most ferns grow best in slightly acidic soils;...   
2  A well-drained, lightly acidic potting soil is...   
3  A well-drained, lightly acidic potting soil is...   
4  A well-drained, lightly acidic potting soil is...   
5  A well-drained, lightly acidic potting soil is...   
6  A well-drained, lightly acidic potting soil is...   

                                      Top_Words  
0      [Suitable, for, light, (sandy),, medium]  
1                 [ferns, Most, grow, best, in]  
2  [A, well-drained,, lightly, acidic, potting]  
3  [A, well-drained,, lightly, acidic, potting]  
4  [A, well-drained,, lightly, acidic, potting]  
5  [A, well-drained,, lightly, acidic, potting]  
6  [A, well-drained,, lightly, acidic, potting]  


In [None]:
import re

# Sample measurement lines
data = """Plant Overall Height APPROXIMATELY:- 16cm for POT SIZE 12cm(?)
Plant Overall Height APPROXIMATELY:- 35cm for POT SIZE 15cm(?)
Plant Overall Height APPROXIMATELY:- 28cm for POT SIZE 14cm(?)
..."""

# Collect every run of digits that is immediately followed by "cm"
heights = [match.group() for match in re.finditer(r'\d+cm', data)]

# Print each extracted measurement on its own line
for height in heights:
    print(height)


16cm
12cm
35cm
15cm
28cm
14cm


## Catch Keywords

In [None]:
import pandas as pd
from collections import Counter

# High-frequency 1-gram keywords

# SOIL
# top_words_1gram = ['organic', 'welldrained', 'acidic', 'welldraining',
#         'caladium', 'sandy', 'vermiculite', 'regular','medium', 'loose',
#         'mixture', 'light', 'drainage', 'moist', 'peat',
#         'moss', 'cactus', 'perlite', 'coarse',
#         'moisture', 'rich', 'matter', 'mix', 'loamy', 'compost', 'bark',
#         'porous', 'sphagnum', 'high', 'orchid', 'peatbased', 'sand']

# FERTILIZE
# top_words_1gram = ['strength', 'houseplant', 'pellets', 'month', 'feed', 'soil',
#           'halfstrength', 'slowrelease', 'aglaonema', 'balanced',
#           'watersoluble', 'two', 'use', 'houseplants', 'weekly', 'high',
#           'months', 'per', 'roots', 'summer', 'philodendron',
#           'macronutrients', 'diluted', 'regular', 'season', 'foliage', 'weeks',
#           'liquid', 'spring', 'necessary', 'fertilizer', 'food', 'monthly', 'nitrogen',
#           'recommended', 'winter', 'growth', 'three', 'water']
# LIGHT
# top_words_1gram = ['expose', 'near', 'day', 'aglaonema', 'exposure',
#           'never', 'direct', 'scorch', 'part', 'full', 'partial',
#           'position', 'hot', 'low', 'avoid', 'ideal',
#           'provide', 'varieties', 'foliage', 'filtered', 'shady',
#           'morning', 'medium', 'shaded', 'shade', 'conditions', 'much', 'tolerate',
#            'indirect', 'bright', 'indoors', 'touch', 'window', 'moderate']

# WATER
# top_words_1gram = ['soil', 'regularly', 'thoroughly', 'never', 'humidity', 'level',
#           'good', 'overwater', 'overwatering', 'index', 'often',
#           'week', 'avoid', 'summer', 'philodendron', 'least', 'slightly',
#           'dry', 'season', 'moisture', 'soggy', 'evenly',
#           'reduce', 'inch', 'winter', 'moist', 'little', 'moderate']
# If not specified, set 'regular'

# GENERAL
# top_words_1gram = ['leaves', 'beautiful', 'tropical', 'family', 'cactus', 'philodendron',
#           'flowers', 'foliage', 'evergreen', 'attractive', 'color', 'popular', 'easy',
#           'fern', 'tree', 'alocasia', 'ficus', 'flowering', 'succulent', 'caladium',
#           'araceae', 'flower']
# Active keyword list: preprocess_and_filter keeps only the words that appear in it.
top_words_1gram = ['premium', 'cactus', 'climber', 'creeper', 'fern', 'herb',
           'orchid', 'shrub', 'succulent', 'tree', 'tropical', 'flower',
           'evergreen', 'attractive', 'philodendron', 'alocasia', 'ficus', 'caladium']
# Paths of the raw input file and of the processed output file
input_file = "/content/drive/MyDrive/iss/5001/GENERAL INFORMATION.txt"
output_file = "/content/drive/MyDrive/iss/5001/GENERAL INFORMATION_output.txt"

# Read the tab-separated raw data file into a DataFrame
df = pd.read_csv(input_file, delimiter="\t")


In [None]:
# Data processing: normalise the text, keep only the keywords, de-duplicate
def preprocess_and_filter(text, keywords=None):
    """Return the keywords that occur in *text*, joined by single spaces.

    The text is lowercased and stripped of ``string.punctuation`` characters,
    then split on whitespace. Only words contained in the keyword collection
    are kept; each keyword appears once, in the order it first occurs in the
    text, so the result is reproducible between runs.

    Args:
        text: The cell value to process. Missing values (``None`` or a float
            NaN, e.g. what pandas uses for an empty cell) give ``''``; any
            other non-string value is converted with ``str``.
        keywords: Optional iterable of lowercase keywords to keep. Defaults
            to the module-level ``top_words_1gram`` list.

    Returns:
        A space-separated string of the matched keywords, possibly empty.
    """
    if not isinstance(text, str):
        # NaN is the only float that is not equal to itself.
        if text is None or (isinstance(text, float) and text != text):
            return ''
        text = str(text)

    # Build the lookup set once per call instead of scanning a list per word.
    allowed = set(top_words_1gram if keywords is None else keywords)

    text = text.lower().translate(str.maketrans('', '', string.punctuation))
    matches = [word for word in text.split() if word in allowed]
    # dict keys preserve insertion order: de-duplicate without shuffling.
    return ' '.join(dict.fromkeys(matches))

# Data processing: store the keywords found in each row (de-duplicated) in a new column
df['Processed_Text'] = df['GENERAL INFORMATION'].apply(preprocess_and_filter)

# Write the result to output_file as tab-separated text (output_file is a
# separate path, so the input file is not overwritten)
df.to_csv(output_file, sep="\t", index=False)

## PRODUCT MEASUREMENT

In [None]:
# Paths of the raw input file and of the processed output file
input_file = "/content/drive/MyDrive/iss/5001/PRODUCT MEASUREMENT.txt"
output_file = "/content/drive/MyDrive/iss/5001/PRODUCT MEASUREMENT_output.txt"

# Read the tab-separated raw data file into a DataFrame
df = pd.read_csv(input_file, delimiter="\t")

In [None]:
# Data processing: extract the centimetre measurements from a cell
def preprocess_and_filter(text):
    """Return every ``<number>cm`` measurement in *text*, space-separated.

    The value is converted with ``str`` first, so non-string cells (numbers,
    NaN, ``None``) are accepted and simply yield ``''`` when they contain no
    measurement.

    The pattern is applied to the unmodified text. Punctuation is
    deliberately not stripped beforehand: deleting it would fuse digits
    that it separates, turning "12.5cm" into "125cm" and "10-15cm" into
    "1015cm". An optional decimal part is matched so "12.5cm" is kept whole.

    Args:
        text: The cell value to scan.

    Returns:
        The matched measurements in order of appearance, joined by single
        spaces; an empty string when nothing matches.
    """
    return ' '.join(re.findall(r'\d+(?:\.\d+)?cm', str(text)))

df['Processed_Text'] = df['PRODUCT MEASUREMENT'].apply(preprocess_and_filter)

# Write the result to output_file as tab-separated text (output_file is a
# separate path, so the input file is not overwritten)
df.to_csv(output_file, sep="\t", index=False)