In [None]:
!pip install opencc

import numpy as np
import pandas as pd
from opencc import OpenCC
import random

In [None]:
# Load the review spreadsheet. It is read without a header row, so columns
# are addressed by integer position.
file_path = "Hotel2000.xlsx"
df = pd.read_excel(file_path, header=None)

# OpenCC converter built with the 't2s' configuration.
cc = OpenCC('t2s')


def convert_to_simplified(text):
    """Return ``text`` after passing it through the module-level OpenCC converter."""
    simplified = cc.convert(text)
    return simplified


# Convert every entry of column 0 from Traditional to Simplified Chinese.
df[0] = df[0].map(convert_to_simplified)

In [None]:
import jieba

In [None]:
# Segment every review in column 0 with jieba and keep the tokens as a single
# space-separated string per review.
tokenized_text = [" ".join(jieba.cut(text)) for text in df[0]]

# Store the segmented text in a new column.
df['tokenized'] = tokenized_text

df['tokenized']

In [None]:
# Read the stop-word file (one entry per line) into a set.
stopwords_path = 'baidu_stopwords.txt'
with open(stopwords_path, 'r', encoding='utf-8') as f:
    stopwords = set(f.read().splitlines())


def _drop_stopwords(tokenized_review):
    """Return the space-separated review with every stop-word token removed."""
    kept_tokens = filter(lambda token: token not in stopwords, tokenized_review.split())
    return " ".join(kept_tokens)


# Filter the stop words out of each segmented review.
cleaned_text = [_drop_stopwords(text) for text in df['tokenized']]

# Store the filtered text in a new column.
df['cleaned'] = cleaned_text

In [None]:
# Display the frame to inspect the columns added so far.
df

In [None]:
# Register the custom vocabulary with jieba before segmenting again.
user_dict_path = 'output.txt'
jieba.load_userdict(user_dict_path)

# Re-segment the stop-word-filtered text now that the custom dictionary is loaded.
custom_tokenized_data = [" ".join(jieba.cut(text)) for text in df['cleaned']]

# Store the re-segmented text in a new column.
df['custom_tokenized'] = custom_tokenized_data


In [None]:
import unicodedata
def _is_punctuation_or_digit(char):
    """Return True when ``char`` has a Unicode 'P*' category or is a digit."""
    return unicodedata.category(char).startswith('P') or char.isdigit()


# Strip punctuation and digits from every re-segmented review, character by character.
cleaned_tokens_without_punctuation_and_digits = [
    ''.join(char for char in text if not _is_punctuation_or_digit(char))
    for text in df['custom_tokenized']
]

# Store the stripped text in a new column and show it.
df['final_cleaned'] = cleaned_tokens_without_punctuation_and_digits
df['final_cleaned']

In [None]:
# Drop every character that str.isupper() reports as uppercase.
cleaned_tokens_without_uppercase = [
    ''.join(filter(lambda char: not char.isupper(), text))
    for text in df['final_cleaned']
]

# Store the result in a new column and show it.
df['final_cleaned_no_upper'] = cleaned_tokens_without_uppercase
df['final_cleaned_no_upper']

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# Target labels come from column 5.
y = df[5]

# TF-IDF features built from the fully cleaned text.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['final_cleaned_no_upper'])

# Hold out 30% of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.3
)

# Fit a support vector classifier with default settings (fit returns the estimator).
svm_classifier = SVC().fit(X_train, y_train)

# Predict the held-out rows and print the per-class report.
y_pred = svm_classifier.predict(X_test)
print(classification_report(y_test, y_pred))

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# TF-IDF features built from the fully cleaned text.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['final_cleaned_no_upper'])

# Use df[5] as the target.
y = df[5]

# Split into training and test sets (30% held out, seeded).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train a random forest. The forest is seeded like the split: its bootstrap
# and feature sub-sampling are random, so an unseeded forest would print a
# different report on every run.
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train, y_train)

# Predict the held-out rows.
y_pred = rf_classifier.predict(X_test)

# Print the per-class report.
print(classification_report(y_test, y_pred))


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Target labels come from column 5.
y = df[5]

# TF-IDF features built from the fully cleaned text.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['final_cleaned_no_upper'])

# Hold out 30% of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.3
)

# Fit logistic regression with default settings (fit returns the estimator).
lr_classifier = LogisticRegression().fit(X_train, y_train)

# Predict the held-out rows and print the per-class report.
y_pred = lr_classifier.predict(X_test)
print(classification_report(y_test, y_pred))


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

# TF-IDF features built from the fully cleaned text.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['final_cleaned_no_upper'])

# Use df[5] as the target.
y = df[5]

# Split into training and test sets (30% held out, seeded).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train a decision tree. The tree is seeded like the split so that ties
# between equally good splits are broken the same way on every run and the
# printed report is repeatable.
dt_classifier = DecisionTreeClassifier(random_state=42)
dt_classifier.fit(X_train, y_train)

# Predict the held-out rows.
y_pred = dt_classifier.predict(X_test)

# Print the per-class report.
print(classification_report(y_test, y_pred))


In [None]:
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report


# Hyper-parameters.
max_features = 2000  # vocabulary cap for the tokenizer; also the embedding input dimension
embed_dim = 128
lstm_out = 196
batch_size = 32
epochs = 10

# Build the word index from the cleaned, space-separated text.
tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(df['final_cleaned_no_upper'].values)

# Turn each text into an integer sequence and pad all sequences to one length.
X = tokenizer.texts_to_sequences(df['final_cleaned_no_upper'].values)
X = pad_sequences(X)

# One-hot encode the labels in df[5]. The dtype is requested explicitly
# because the default dtype of get_dummies depends on the pandas version
# (uint8 in older releases, bool in newer ones).
y = pd.get_dummies(df[5], dtype=np.float32).values

# Split into training and test sets (30% held out, seeded).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Build the LSTM classifier: embedding -> spatial dropout -> LSTM -> softmax.
model = Sequential()
model.add(Embedding(max_features, embed_dim, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(y.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so it is not wrapped in
# print(), which would append a stray "None" line.
model.summary()

# Train the model.
model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, verbose=1)



In [None]:
# Class probabilities for the held-out reviews.
y_pred = model.predict(X_test)

# Collapse the probability rows and the one-hot label rows to class indices.
y_pred_classes = y_pred.argmax(axis=1)
y_test_classes = y_test.argmax(axis=1)

# Print the per-class report.
print(classification_report(y_test_classes, y_pred_classes))


In [None]:
from sklearn.metrics import precision_recall_fscore_support

# Weighted-average precision, recall, F1-score (and support) for the LSTM predictions.
precision, recall, f1_score, support = precision_recall_fscore_support(
    y_true=y_test_classes,
    y_pred=y_pred_classes,
    average='weighted',
)

# Print the three scores with four decimals.
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1-score: {f1_score:.4f}')


In [None]:
pip install transformers torch sklearn


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m51.8 MB/s[0m eta [36m0:00:00[0m
Collecting sklearn
  Downloading sklearn-0.0.post4.tar.gz (3.6 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m82.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m22.6 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: sklearn
  Building wheel for sklearn (setup.

In [None]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import GPT2ForSequenceClassification, GPT2Tokenizer, GPT2Config
from transformers import Trainer, TrainingArguments
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding
from torch.utils.data import Dataset
from transformers import BertTokenizer
from transformers import TrainingArguments
# Read the raw spreadsheet into its own variable. Rebinding `df` here would
# discard the derived columns (e.g. 'final_cleaned_no_upper') that the
# explanation cell at the end of the notebook still reads from `df`.
hotel_raw_df = pd.read_excel("Hotel2000.xlsx", header=None)


class TextClassificationDataset(Dataset):
    """Dataset over three parallel containers of pre-tokenised examples.

    Item ``idx`` is a dict with the keys 'input_ids', 'attention_mask' and
    'labels', each taken from position ``idx`` of the matching container.
    """

    def __init__(self, input_ids, attention_masks, labels):
        self.input_ids = input_ids
        self.attention_masks = attention_masks
        self.labels = labels

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_masks[idx],
            'labels': self.labels[idx],
        }


# Raw review texts (column 0) and labels (column 5).
texts = hotel_raw_df[0].tolist()
labels = hotel_raw_df[5].tolist()

# Count the classes while `labels` is still a list of Python values. Iterating
# a tensor yields 0-d tensors that hash by identity, so len(set(tensor)) is the
# number of rows, not the number of classes.
num_labels = len(set(labels))

# Tokenizer for the checkpoint, with an explicit padding token.
tokenizer = BertTokenizer.from_pretrained("ckiplab/gpt2-base-chinese", pad_token="[PAD]")

# Encode all texts, padded to a common length and truncated at 512 tokens.
input_encodings = tokenizer(texts, padding=True, truncation=True, max_length=512, return_tensors="pt")
input_ids = input_encodings["input_ids"]
attention_mask = input_encodings["attention_mask"]

# Convert the labels to a tensor.
labels = torch.tensor(labels)

# Split ids and masks with the same seed so that both splits select the same rows.
train_inputs, test_inputs, train_labels, test_labels = train_test_split(input_ids, labels, random_state=42, test_size=0.3)
train_masks, test_masks, _, _ = train_test_split(attention_mask, labels, random_state=42, test_size=0.3)

# Load the pretrained model once, with a classification head sized to the real
# number of classes. pad_token_id is set so the sequence-classification head
# reads the last non-padding token of each row; without it the head uses the
# final position, which is padding for every review shorter than the longest.
config = GPT2Config.from_pretrained(
    "ckiplab/gpt2-base-chinese",
    num_labels=num_labels,
    pad_token_id=tokenizer.pad_token_id,
)
model = GPT2ForSequenceClassification.from_pretrained("ckiplab/gpt2-base-chinese", config=config)
# model.init_weights() is intentionally not called: from_pretrained already
# initialises the new 'score' head (see the load message), and depending on the
# transformers version a second initialisation is either a no-op or overwrites
# the pretrained weights.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=2,
    per_device_train_batch_size=1,  # train with batch size 1
    per_device_eval_batch_size=1,   # evaluate with batch size 1
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
    evaluation_strategy="steps",
    eval_steps=500,
)

# Wrap the train and test tensors in the dataset type.
train_dataset = TextClassificationDataset(train_inputs, train_masks, train_labels)
eval_dataset = TextClassificationDataset(test_inputs, test_masks, test_labels)

# Data collator and trainer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
)

# NOTE(review): the Trainer chooses its own device from training_args; moving
# the model to CPU afterwards presumably only works on a CPU-only runtime.
# Confirm before running this cell with a GPU attached.
device = torch.device("cpu")
model.to(device)

# Train the model.
trainer.train()

# Evaluate on the held-out set.
preds = trainer.predict(eval_dataset)
pred_labels = np.argmax(preds.predictions, axis=1)
true_labels = test_labels.numpy()
print(classification_report(true_labels, pred_labels))



Downloading (…)lve/main/config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/421M [00:00<?, ?B/s]

Some weights of the model checkpoint at ckiplab/gpt2-base-chinese were not used when initializing GPT2ForSequenceClassification: ['lm_head.weight']
- This IS expected if you are initializing GPT2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing GPT2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at ckiplab/gpt2-base-chinese and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/110k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/174 [00:00<?, ?B/s]

Some weights of the model checkpoint at ckiplab/gpt2-base-chinese were not used when initializing GPT2ForSequenceClassification: ['lm_head.weight']
- This IS expected if you are initializing GPT2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing GPT2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at ckiplab/gpt2-base-chinese and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss
500,1.3216,2.879526
1000,1.3802,1.39087
1500,0.6004,0.685067
2000,0.0659,1.15695
2500,0.8516,0.988568


              precision    recall  f1-score   support

           0       0.80      0.87      0.83       298
           1       0.86      0.79      0.82       302

    accuracy                           0.83       600
   macro avg       0.83      0.83      0.83       600
weighted avg       0.83      0.83      0.83       600



In [None]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import GPT2ForSequenceClassification, GPT2Config
from transformers import BertTokenizer
from transformers import TrainingArguments, Trainer
from torch.utils.data import Dataset
from transformers import DataCollatorWithPadding

# Read the raw spreadsheet into its own variable. Rebinding `df` here would
# discard the derived columns (e.g. 'final_cleaned_no_upper') that the
# explanation cell at the end of the notebook still reads from `df`.
hotel_raw_df = pd.read_excel("Hotel2000.xlsx", header=None)


class TextClassificationDataset(Dataset):
    """Dataset over three parallel containers of pre-tokenised examples.

    Item ``idx`` is a dict with the keys 'input_ids', 'attention_mask' and
    'labels', each taken from position ``idx`` of the matching container.
    """

    def __init__(self, input_ids, attention_masks, labels):
        self.input_ids = input_ids
        self.attention_masks = attention_masks
        self.labels = labels

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_masks[idx],
            'labels': self.labels[idx],
        }


# Raw review texts (column 0) and labels (column 5).
texts = hotel_raw_df[0].tolist()
labels = hotel_raw_df[5].tolist()

# Count the classes while `labels` is still a list of Python values. Iterating
# a tensor yields 0-d tensors that hash by identity, so len(set(tensor)) is the
# number of rows, not the number of classes.
num_labels = len(set(labels))

# Use BertTokenizer with an explicit [PAD] token, as the pretrained-model cell
# does for the same checkpoint. GPT2Tokenizer is not imported in this cell, and
# the checkpoint's download log lists a vocab.txt, which is the vocabulary
# format BertTokenizer loads.
tokenizer = BertTokenizer.from_pretrained("ckiplab/gpt2-base-chinese", pad_token="[PAD]")

# Encode all texts, padded to a common length and truncated at 512 tokens.
input_encodings = tokenizer(texts, padding=True, truncation=True, max_length=512, return_tensors="pt")
input_ids = input_encodings["input_ids"]
attention_mask = input_encodings["attention_mask"]

# Convert the labels to a tensor.
labels = torch.tensor(labels)

# Split ids and masks with the same seed so that both splits select the same rows.
train_inputs, test_inputs, train_labels, test_labels = train_test_split(input_ids, labels, random_state=42, test_size=0.3)
train_masks, test_masks, _, _ = train_test_split(attention_mask, labels, random_state=42, test_size=0.3)

# Randomly initialised GPT-2 classifier (no pretrained weights). The head is
# sized to the real number of classes, and pad_token_id is set so the head
# reads the last non-padding token of each row instead of a padding position.
config = GPT2Config(
    vocab_size=tokenizer.vocab_size,
    num_labels=num_labels,
    pad_token_id=tokenizer.pad_token_id,
)
model = GPT2ForSequenceClassification(config=config)

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=2,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
    evaluation_strategy="steps",
    eval_steps=500,
)

# Wrap the train and test tensors in the dataset type.
train_dataset = TextClassificationDataset(train_inputs, train_masks, train_labels)
eval_dataset = TextClassificationDataset(test_inputs, test_masks, test_labels)

# Data collator and trainer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
)

# NOTE(review): the Trainer chooses its own device from training_args; moving
# the model to CPU afterwards presumably only works on a CPU-only runtime.
# Confirm before running this cell with a GPU attached.
device = torch.device("cpu")
model.to(device)

# Train the model.
trainer.train()

# Evaluate on the held-out set.
preds = trainer.predict(eval_dataset)
pred_labels = np.argmax(preds.predictions, axis=1)
true_labels = test_labels.numpy()
print(classification_report(true_labels, pred_labels))




Step,Training Loss,Validation Loss
500,2.0408,1.734604
1000,1.3775,1.324381
1500,1.1098,0.96176
2000,0.1867,1.117057
2500,0.7204,0.820944


              precision    recall  f1-score   support

           0       0.83      0.88      0.85       298
           1       0.87      0.82      0.85       302

    accuracy                           0.85       600
   macro avg       0.85      0.85      0.85       600
weighted avg       0.85      0.85      0.85       600



In [None]:
def perturb_sentence(sentence, num_perturbations):
    """Create randomly shortened variants of a whitespace-tokenised sentence.

    Each variant is built by deleting a random number of randomly chosen words
    (between 1 and ``len(words) // 2``, and exactly 1 for a one-word sentence)
    and joining the remaining words, in their original order, with single
    spaces.

    Args:
        sentence: Text whose words are separated by whitespace.
        num_perturbations: Number of variants to generate.

    Returns:
        A list of ``num_perturbations`` strings. For a sentence without any
        words every variant is the empty string.
    """
    words = sentence.split()
    num_words = len(words)

    # Nothing can be removed from an empty sentence.
    if num_words == 0:
        return [''] * num_perturbations

    # random.randint(1, 0) raises ValueError, so a one-word sentence needs the
    # upper bound clamped to 1.
    max_words_to_remove = max(1, num_words // 2)

    perturbed_sentences = []
    for _ in range(num_perturbations):
        num_words_to_remove = random.randint(1, max_words_to_remove)
        removed_indices = set(random.sample(range(num_words), num_words_to_remove))
        kept_words = [word for index, word in enumerate(words) if index not in removed_indices]
        perturbed_sentences.append(' '.join(kept_words))

    return perturbed_sentences

In [None]:
from tqdm import tqdm
# Imports used by this cell, hoisted out of the per-row loop.
from sklearn import tree
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
import xgboost as xgb

NUM_PERTURBATIONS = 1000             # perturbed variants generated per review
KERNEL_WIDTH = 0.25                  # width of the similarity kernel used as sample weight
NO_REFERENCE_WORDS = '该栏位没有词'    # placeholder written when column 3 holds no words
SEED = 42

random.seed(SEED)


def _rank_words(tokens, weights):
    """Return ``tokens`` ordered by decreasing absolute weight.

    ``weights[j]`` must belong to ``tokens[j]``; ties keep sentence order.
    """
    ranked_pairs = sorted(zip(tokens, weights), key=lambda pair: -abs(pair[1]))
    return [word for word, _ in ranked_pairs]


def _top_k_overlap(reference_tokens, ranked_words):
    """Score the top-k ranked words against the reference tokens.

    k is the number of reference tokens. Returns a tuple of the percentage of
    reference tokens found among the top-k words and the top-k words themselves.
    """
    k = len(reference_tokens)
    top_words = ranked_words[:k]
    matched_words = [word for word in reference_tokens if word in top_words]
    return len(matched_words) / k * 100, top_words


# Rebuild the TF-IDF training split from the fitted vectorizer. The names
# X_train / y_train are rebound by the LSTM cell (padded sequences, one-hot
# labels), so they can no longer be relied on here. The same test_size and
# seed reproduce the split used by the classical-model cells.
tfidf_matrix = vectorizer.transform(df['final_cleaned_no_upper'])
X_train_tfidf, _, y_train_tfidf, _ = train_test_split(
    tfidf_matrix, df[5], test_size=0.3, random_state=SEED
)

# The black-box classifier does not depend on the row being explained, so it
# is fitted once instead of once per review.
svm_classifier = SVC(probability=True, random_state=SEED)
svm_classifier.fit(X_train_tfidf, y_train_tfidf)

# predict_proba orders its columns like classes_, so the column for label 1 is
# looked up rather than assumed (column 0 is the first class, not label 1).
positive_class_column = list(svm_classifier.classes_).index(1)

data = []
for k in tqdm(range(len(df))):
    # Column 3 holds the reference words the explanations are scored against.
    reference_value = df.iloc[k, 3]
    reference_tokens = [] if pd.isna(reference_value) else reference_value.split()
    if not reference_tokens:
        # No reference words: nothing to score, so the expensive surrogate
        # fits are skipped and the placeholder row is written directly.
        data.append([0, NO_REFERENCE_WORDS] * 4)
        continue

    sentence = df['final_cleaned_no_upper'].iloc[k]
    sentence_tokens = sentence.split()
    if len(sentence_tokens) < 2:
        # A review with fewer than two tokens cannot be perturbed in a
        # meaningful way; record zero scores with empty word lists.
        data.append([0, ''] * 4)
        continue

    # Black-box probability of label 1 for each perturbed variant.
    perturbed_sentences = perturb_sentence(sentence, NUM_PERTURBATIONS)
    perturbed_sentences_tfidf = vectorizer.transform(perturbed_sentences)
    predictions_proba = svm_classifier.predict_proba(perturbed_sentences_tfidf)
    predictions_proba_class_1_array = predictions_proba[:, positive_class_column]

    # Sample weights: kernel over the cosine distance between the original
    # review and each variant, so closer variants weigh more.
    original_sentence_tfidf = vectorizer.transform([sentence])
    cosine_similarities = cosine_similarity(original_sentence_tfidf, perturbed_sentences_tfidf)
    rbf_similarities_array = np.exp(-0.5 * (1 - cosine_similarities[0]) ** 2 / KERNEL_WIDTH**2)

    # Binary design matrix: entry (i, j) is 1 when original token j still
    # occurs in variant i.
    attributes_matrix = np.zeros((len(perturbed_sentences), len(sentence_tokens)))
    for i, perturbed_sentence in enumerate(perturbed_sentences):
        present_tokens = set(perturbed_sentence.split())
        for j, original_token in enumerate(sentence_tokens):
            if original_token in present_tokens:
                attributes_matrix[i, j] = 1

    # Four weighted surrogate models fitted on the same local data.
    linear_regression = LinearRegression()
    linear_regression.fit(attributes_matrix, predictions_proba_class_1_array, sample_weight=rbf_similarities_array)

    treemodel = tree.DecisionTreeRegressor(max_depth=5, random_state=SEED)
    treemodel.fit(attributes_matrix, predictions_proba_class_1_array, sample_weight=rbf_similarities_array)

    regr_rf = RandomForestRegressor(random_state=SEED)
    regr_rf.fit(X=attributes_matrix, y=predictions_proba_class_1_array, sample_weight=rbf_similarities_array)

    xgbrModel = xgb.XGBRegressor()
    xgbrModel.fit(attributes_matrix, predictions_proba_class_1_array, sample_weight=rbf_similarities_array)

    # Every weight vector is indexed like the columns of attributes_matrix,
    # i.e. in sentence order, so it is paired with sentence_tokens directly.
    # Pairing the importances with words already sorted by linear coefficient
    # would attach them to the wrong words.
    # Output order: linear, importance0 (tree), importance1 (forest), importance2 (xgboost).
    surrogate_weights = (
        linear_regression.coef_,
        treemodel.feature_importances_,
        regr_rf.feature_importances_,
        xgbrModel.feature_importances_,
    )
    row = []
    for weights in surrogate_weights:
        score, top_words = _top_k_overlap(reference_tokens, _rank_words(sentence_tokens, weights))
        row.extend([score, ','.join(top_words)])
    data.append(row)

result_df = pd.DataFrame(data=data, columns=['linear_score', 'linear_most_important_words',
                                             'importance0_score', 'importance0_most_important_words',
                                             'importance1_score', 'importance1_most_important_words',
                                             'importance2_score', 'importance2_most_important_words'])
result_df.to_csv('results.csv', index=False, encoding='utf_8_sig')