In [5]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import json
import pandas as pd
import os

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# TF-Hub handle of the Universal Sentence Encoder (version 4) module; the
# loaded object is called later to turn sentences into embedding vectors.
USE_MODULE_URL = "https://tfhub.dev/google/universal-sentence-encoder/4"
embed = hub.load(USE_MODULE_URL)

In [6]:
# Input directory with the cleaned CSV files and output directory for the
# copies that will carry the sentence embeddings.
path, dest_path = 'cleanedDCinboxResource/', 'encodedDCinboxResource/'

# NOTE(review): os.listdir returns entries in arbitrary order and includes
# every entry in the directory. The integer topic labels derived from `dirs`
# later therefore depend on this order — confirm it is stable across runs.
dirs = os.listdir(path)
dirs

['healthcare.json.csv',
 'pet.json.csv',
 'business.json.csv',
 'fashion beauty.json.csv',
 'fitness sports.json.csv',
 'food beverage.json.csv',
 'media enterainment.json.csv',
 'lifecycle home.json.csv',
 'consumer.json.csv',
 'green cleantech.json.csv',
 'children education.json.csv',
 'software tech.json.csv']

In [4]:
# Read every listed CSV into its own DataFrame, keyed by its file name.
df_dict = {name: pd.read_csv(path + name) for name in dirs}

# Sanity check: show the first rows of one of the loaded frames.
df_dict['healthcare.json.csv'].head()

Unnamed: 0,sentences
0,Congressman Carol Miller ?
1,"Dear Friend, Happy Presidents' Day weekend!"
2,On this day we honor all that our Presidents h...
3,This week was filled with many exciting develo...
4,I introduced legislation to fill the remaining...


In [5]:
# Print the healthcare sentences as a NumPy array; this is the same array form
# that is passed to the encoder in the embedding step.
print(df_dict['healthcare.json.csv']['sentences'].to_numpy())

['Congressman Carol Miller ?'
 "Dear Friend, Happy Presidents' Day weekend!"
 'On this day we honor all that our Presidents have done to lead our nation into the prosperous and powerful one it is today.'
 ...
 'There would never be enough time in the world to fully encapsulate what he meant to me and the people of Michigans Thirteenth Congressional District, but please watch my full address in honor of his tremendous legacy and know that it is that tremendous legacy that fuels my work on your behalf every single day.'
 'My sincere condolences go out to his family, friends, and loved ones.'
 'May Congressman Conyers rest in peace as we continue his fight for jobs, justice, and peace.']


In [6]:
# Encode all sentences of each frame in a single call and store the result in
# a new 'embed' column, one plain Python list of floats per row.
for frame in df_dict.values():
    sentence_vectors = embed(frame['sentences'].to_numpy())
    frame['embed'] = sentence_vectors.numpy().tolist()

In [7]:
# Preview the healthcare frame to check that the 'embed' column was added.
df_dict['healthcare.json.csv'].head()

Unnamed: 0,sentences,embed
0,Congressman Carol Miller ?,"[0.05492091551423073, 0.023301545530557632, 0...."
1,"Dear Friend, Happy Presidents' Day weekend!","[-0.03798495605587959, -0.06996205449104309, 0..."
2,On this day we honor all that our Presidents h...,"[-0.06237169727683067, -0.08522521704435349, 0..."
3,This week was filled with many exciting develo...,"[-0.04813029244542122, -0.04647025093436241, -..."
4,I introduced legislation to fill the remaining...,"[-0.04485123232007027, -0.07864920049905777, -..."


In [8]:
# Persist each embedded frame under dest_path, reusing the source file name.
# The output directory is created first because DataFrame.to_csv does not
# create missing parent directories and fails if dest_path does not exist.
# Note that the list-valued 'embed' column is written as its string
# representation, so it must be parsed again when the CSV is read back.
os.makedirs(dest_path, exist_ok=True)
for (key, data) in df_dict.items():
    data.to_csv(dest_path + key, index=False)

In [9]:
# Tag every row with its topic (the source file name without the '.json.csv'
# suffix) and stack all per-file frames into a single DataFrame.
# DataFrame.assign returns a new frame, so the frames stored in df_dict are
# left unmodified; writing the column in place would leak 'topic' into them
# and into any later re-run of the cell that saves them.
frames = [
    data.assign(topic=key.replace('.json.csv', ''))
    for (key, data) in df_dict.items()
]

# The per-file row indices are kept as-is, so index values repeat across topics.
df = pd.concat(frames)


In [10]:
# Preview the first rows of the combined frame.
df.head()

Unnamed: 0,sentences,embed,topic
0,Congressman Carol Miller ?,"[0.05492091551423073, 0.023301545530557632, 0....",healthcare
1,"Dear Friend, Happy Presidents' Day weekend!","[-0.03798495605587959, -0.06996205449104309, 0...",healthcare
2,On this day we honor all that our Presidents h...,"[-0.06237169727683067, -0.08522521704435349, 0...",healthcare
3,This week was filled with many exciting develo...,"[-0.04813029244542122, -0.04647025093436241, -...",healthcare
4,I introduced legislation to fill the remaining...,"[-0.04485123232007027, -0.07864920049905777, -...",healthcare


In [11]:
# Preview the last rows of the combined frame.
df.tail()

Unnamed: 0,sentences,embed,topic
3660,The company has a wide range of involvement in...,"[0.02320779860019684, 0.06694994866847992, 0.0...",software tech
3661,Treeline is a prime example of how the forest ...,"[0.02034887485206127, 0.02548053488135338, 0.0...",software tech
3662,Through sustainable and responsible forest man...,"[-0.021101249381899834, 0.05313967540860176, -...",software tech
3663,I had a great discussion with Brian about the ...,"[-0.047962334007024765, -0.06889202445745468, ...",software tech
3664,Click HERE to learn more.,"[-0.07666020095348358, -0.08758419752120972, 0...",software tech


### Map each topic to an integer label

In [29]:
# Topic names are the file names with the '.json.csv' suffix removed.
topics = [file_name.replace('.json.csv', '') for file_name in dirs]

# The positional index of a one-column frame built from the names supplies
# the integer label for each topic.
topics_df = pd.DataFrame(topics)
topics_index = topics_df.index.tolist()

# topic name -> integer label, in the same order as `dirs`.
topic_dict = {name: label for name, label in zip(topics, topics_index)}
topic_dict

{'healthcare': 0,
 'pet': 1,
 'business': 2,
 'fashion beauty': 3,
 'fitness sports': 4,
 'food beverage': 5,
 'media enterainment': 6,
 'lifecycle home': 7,
 'consumer': 8,
 'green cleantech': 9,
 'children education': 10,
 'software tech': 11}

### Generate X_train

In [18]:
# Expand the list-valued 'embed' column into one numeric column per embedding
# component (columns labelled 0..511); the result gets a fresh 0-based index.
embedding_rows = df['embed'].tolist()
X_train = pd.DataFrame(embedding_rows, columns=range(0, 512))
X_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,502,503,504,505,506,507,508,509,510,511
0,0.054921,0.023302,0.020982,-0.026948,-0.023186,-0.065200,-0.058983,-0.035443,0.048876,-0.046086,...,-0.006283,-0.028407,0.003497,-0.021077,-0.052233,-0.006062,-0.033835,0.056937,0.031446,0.074664
1,-0.037985,-0.069962,0.037102,0.048221,0.053405,0.058545,-0.050000,-0.013386,-0.034736,-0.045103,...,0.001811,0.020552,0.066109,0.010746,-0.039018,-0.012361,0.022109,-0.010583,-0.054189,0.020036
2,-0.062372,-0.085225,0.038230,0.006329,-0.008876,0.073339,0.016989,-0.016803,-0.003109,0.001256,...,0.006282,-0.033230,0.001216,-0.035546,0.016305,-0.069788,-0.025516,-0.056390,-0.004127,0.027242
3,-0.048130,-0.046470,-0.015697,-0.013564,-0.095009,0.039518,0.052138,-0.023493,-0.004937,-0.018788,...,0.075226,0.078493,0.040549,-0.002126,0.020539,-0.067325,-0.006475,-0.041448,-0.033357,0.042049
4,-0.044851,-0.078649,-0.023230,-0.033329,-0.024911,-0.073252,-0.021818,-0.016139,-0.032433,-0.080862,...,0.015235,-0.041002,0.026818,-0.065467,0.016234,0.016308,0.038927,-0.000463,-0.031922,0.050660
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45688,0.023208,0.066950,0.002083,-0.033706,0.027408,-0.083156,-0.018206,-0.035894,0.069844,-0.007374,...,0.007152,-0.049285,0.004002,0.003757,-0.085870,-0.053907,0.021028,-0.006023,0.058470,0.010841
45689,0.020349,0.025481,0.055837,0.065053,0.028759,-0.062801,0.001884,-0.045562,0.055871,0.016935,...,0.018313,-0.000405,-0.061381,0.044287,-0.000373,-0.040991,0.030439,-0.016256,0.032974,0.040542
45690,-0.021101,0.053140,-0.007118,-0.037430,0.062071,-0.062945,0.072324,-0.075389,0.030979,-0.051241,...,-0.010869,0.002352,-0.020928,-0.035992,-0.071776,-0.045672,0.019251,0.006642,0.014855,0.079571
45691,-0.047962,-0.068892,0.051299,0.018923,-0.016479,-0.090196,-0.022615,-0.048159,0.049300,-0.058850,...,-0.073839,-0.074018,-0.066652,0.030720,-0.016924,-0.069269,0.002934,0.054469,0.015319,0.026989


In [16]:
# Write the feature matrix to disk; index=False leaves the row index out of the file.
X_train.to_csv('X_train.csv', index=False)

### Generate y_train

In [22]:
# Translate each row's topic name into its integer label via topic_dict.
# The result is a Series named 'topic' that keeps df's row index.
y_train = df['topic'].map(topic_dict)

# Plain NumPy view of the same labels, saved separately later on.
y_train_numpy = y_train.to_numpy()

In [24]:
# Persist the integer labels twice: once through pandas and once through
# NumPy's text writer (which uses its default number format).
# NOTE(review): the default of `header` for Series.to_csv has differed between
# pandas releases — consider passing it explicitly so the layout of
# y_train.csv does not depend on the installed version.
y_train.to_csv('y_train.csv', index=False)
np.savetxt('y_train_numpy.csv', y_train_numpy, delimiter=',')

  """Entry point for launching an IPython kernel.


In [30]:
# Save the topic names as a single column labelled 0. The row index is written
# too (no index=False here), so each name is stored next to its position in
# `topics`, which is the integer label assigned to it in topic_dict.
pd.DataFrame({0: topics}).to_csv('topics.csv')

### Train on the data using sklearn regression

In [15]:
# Fit a support-vector regressor with scikit-learn's default hyperparameters
# on the embedding features, using the integer topic labels as the target.
# NOTE(review): the targets are category ids produced through topic_dict, so
# SVR treats them as points on a numeric scale and predicts continuous values.
# Confirm that regression (rather than a classifier such as svm.SVC) is intended.
from sklearn import svm
clf = svm.SVR()
clf.fit(X = X_train, y = y_train)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [25]:
from joblib import dump, load

In [None]:
# Serialize the fitted model to 'encode.joblib' so it can be reloaded without refitting.
dump(clf, 'encode.joblib') 

In [27]:
# Reload the model from disk, replacing whatever `clf` currently refers to.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code — only load model files that come from a trusted source.
clf = load('encode.joblib')

In [None]:
# Predict on X_train, the same feature frame passed to clf.fit in this notebook,
# so these are in-sample predictions rather than a held-out evaluation.
Y_pridict = clf.predict(X_train)